library(tidyverse)
library(limma)
library(edgeR)
library(plyranges)
library(scales)
library(DT)
library(UpSetR)
library(clusterProfiler)
library(org.Mm.eg.db)
library(enrichplot)
library(ggplot2)
source("/data/share/htp/prime-seq_NextGen/scripts/poolsize_functions.R")Pool Size Differential Expression Analysis
Differential expression analysis comparing different pool sizes (080ng, 320ng, and 920ng).
Setup
Data Loading
Read Count Data
counts_080 <- as.data.frame(as.matrix(readRDS("/data/share/htp/prime-seq_NextGen/data/FC2024_08_01_poolsize/03_zUMIs/80ng/zUMIs_output/expression/poolsize_80ng.dgecounts.rds")$umicount$inex$all)) %>%
rownames_to_column(var="gene_id") %>%
as_tibble()
counts_320 <- as.data.frame(as.matrix(readRDS("/data/share/htp/prime-seq_NextGen/data/FC2024_08_01_poolsize/03_zUMIs/320ng/zUMIs_output/expression/poolsize_320ng.dgecounts.rds")$umicount$inex$all)) %>%
rownames_to_column(var="gene_id") %>%
as_tibble()
counts_920 <- as.data.frame(as.matrix(readRDS("/data/share/htp/prime-seq_NextGen/data/FC2024_08_01_poolsize/03_zUMIs/920ng/zUMIs_output/expression/poolsize_920ng.dgecounts.rds")$umicount$inex$all)) %>%
rownames_to_column(var="gene_id") %>%
as_tibble()Load Annotation Data
genes <- read.delim("/data/share/htp/prime-seq_NextGen/data/FC2024_08_01_poolsize/03_zUMIs/80ng/zUMIs_output/expression/poolsize_80ng.gene_names.txt")
gtf <- read_gff("/data/share/htp/prime-seq_NextGen/data/FC2024_08_01_poolsize/03_zUMIs/80ng/poolsize_80ng.final_annot.gtf")Combine Datasets
# Add pool size prefixes to sample names
colnames(counts_080)[-1] <- paste0("pool080_", colnames(counts_080)[-1])
colnames(counts_320)[-1] <- paste0("pool320_", colnames(counts_320)[-1])
colnames(counts_920)[-1] <- paste0("pool920_", colnames(counts_920)[-1])
# Find common genes across all datasets
counts <- full_join(full_join(counts_080, counts_320, by="gene_id"), counts_920, by="gene_id") %>%
mutate(across(where(is.numeric), ~replace_na(., 0))) %>%
left_join(genes, by="gene_id") %>%
dplyr::select(gene_id, gene_name, everything())
# Create sample metadata
sample_info <- data.frame(
sample = colnames(counts)[-(1:2)],
pool_size = c(rep("pool080", ncol(counts_080)-1),
rep("pool320", ncol(counts_320)-1),
rep("pool920", ncol(counts_920)-1)),
pool_size_numeric = c(rep(080, ncol(counts_080)-1),
rep(320, ncol(counts_320)-1),
rep(920, ncol(counts_920)-1))
)
# Convert pool_size to factor with correct ordering
sample_info$pool_size <- factor(sample_info$pool_size, levels = c("pool080", "pool320", "pool920"))
# Report dimensions of the datasets
print(paste("Combined dataset:", nrow(counts), "genes,", ncol(counts)-2, "samples"))[1] "Combined dataset: 24311 genes, 40 samples"
Analysis with All Samples
This section analyzes all available samples with different filtering strategies.
Quality Control - All Samples
Library Size Distribution - All Samples
lib_sizes <- colSums(counts[,-c(1:2)])
lib_df <- data.frame(
sample = names(lib_sizes),
lib_size = lib_sizes,
pool_size = sample_info$pool_size
)
p_lib <- ggplot(lib_df, aes(x = sample, y = lib_size, fill = pool_size)) +
geom_bar(stat = "identity") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
labs(title = "Library sizes (UMI counts)",
x = "Sample", y = "UMI counts", fill = "Pool size") +
scale_fill_brewer(type = "qual", palette = "Set1") +
scale_y_continuous(labels = comma)
print(p_lib)PCA Before Filtering - All Samples
Principal Component Analysis helps visualize sample relationships and identify potential batch effects.
count_matrix_raw <- as.matrix(counts[,-c(1:2)])
rownames(count_matrix_raw) <- counts$gene_id
logcpm_raw <- cpm(count_matrix_raw, log=TRUE, prior.count=1)
# Remove genes with low variance for PCA
var_genes <- apply(logcpm_raw, 1, var)
high_var_genes <- order(var_genes, decreasing=TRUE)[1:1000]
pca_raw <- prcomp(t(logcpm_raw[high_var_genes,]), scale=TRUE)
pca_df_raw <- data.frame(
PC1 = pca_raw$x[,1],
PC2 = pca_raw$x[,2],
sample = rownames(pca_raw$x),
pool_size = sample_info$pool_size,
lib_size = lib_sizes[rownames(pca_raw$x)]
)
pc1_var <- round(summary(pca_raw)$importance[2,1]*100, 1)
pc2_var <- round(summary(pca_raw)$importance[2,2]*100, 1)
# PCA colored by pool size
ggplot(pca_df_raw, aes(x = PC1, y = PC2, color = pool_size)) +
geom_point(size = 3) +
theme_minimal() +
labs(title = "PCA before filtering (colored by pool size)",
x = paste0("PC1 (", pc1_var, "%)"),
y = paste0("PC2 (", pc2_var, "%)"),
color = "Pool size") +
scale_color_brewer(type = "qual", palette = "Set1")# PCA colored by library size and shaped by pool size
ggplot(pca_df_raw, aes(x = PC1, y = PC2, color = lib_size, shape = pool_size)) +
geom_point(size = 4) +
theme_minimal() +
labs(title = "PCA before filtering (colored by library size)",
x = paste0("PC1 (", pc1_var, "%)"),
y = paste0("PC2 (", pc2_var, "%)"),
color = "Library size",
shape = "Pool size") +
scale_color_gradient(low = "blue", high = "red", labels = comma) +
scale_shape_manual(values = c(16, 17, 18))Filtering - All Samples
We apply multiple filtering steps to remove uninformative genes:
- Sparse genes: Remove genes with low expression (CPM > 3 in >= 5 samples)
- optional Highly expressed genes: Remove top 10 most highly expressed genes (likely housekeeping genes)
- optional Mitochondrial genes: Remove mitochondrial genes
- optional Ribosomal RNA genes: Remove rRNA and rRNA pseudogenes
# Filter sparse genes CPM>3 in >= 5 samples
count_matrix <- as.matrix(counts[,-c(1:2)])
rownames(count_matrix) <- counts$gene_id
cpm_values <- cpm(count_matrix)
count_matrix_zero <- count_matrix
count_matrix_zero[cpm_values < 3] <- 0
keep_sparse <- rowSums(count_matrix_zero[,-c(1:2)] > 1) >= 5
print(paste("Removed sparse genes:", sum(!keep_sparse)))[1] "Removed sparse genes: 11442"
# Filter most highly expressed genes (remove top 10)
sum_expression <- rowSums(count_matrix)
gene_expression <- tibble(
gene_id = counts$gene_id,
gene_name = counts$gene_name,
total_expression = sum_expression,
perc = total_expression/(sum(count_matrix))*100
)
top_genes <- gene_expression %>%
arrange(desc(total_expression)) %>%
head(10)
knitr::kable(top_genes, caption = "Top 10 genes by total expression")| gene_id | gene_name | total_expression | perc |
|---|---|---|---|
| ENSMUSG00000064339.1 | mt-Rnr2 | 156926 | 3.3745263 |
| ENSMUSG00000101249.2 | Gm29216 | 112241 | 2.4136230 |
| ENSMUSG00000064337.1 | mt-Rnr1 | 111224 | 2.3917535 |
| ENSMUSG00000119584.1 | Rn18s-rs5 | 57056 | 1.2269284 |
| ENSMUSG00000100862.2 | Gm10925 | 52956 | 1.1387623 |
| ENSMUSG00000064370.1 | mt-Cytb | 48465 | 1.0421881 |
| ENSMUSG00000101111.2 | Gm28437 | 43057 | 0.9258949 |
| ENSMUSG00000064363.1 | mt-Nd4 | 40849 | 0.8784142 |
| ENSMUSG00000102070.2 | Gm28661 | 39144 | 0.8417500 |
| ENSMUSG00000064341.1 | mt-Nd1 | 34118 | 0.7336712 |
# Filter ribo and mito genes
all_genes <- gtf %>% filter(type=="gene") %>% as_tibble
mitochondrial_genes <- all_genes %>%
filter(seqnames == "chrM") %>%
pull(gene_id)
rrna_genes <- all_genes %>%
filter(gene_type %in% c("rRNA","rRNA_pseudogene")) %>%
pull(gene_id)
# Apply all filters
counts_filt <- counts[keep_sparse,] %>%
filter(!(gene_id %in% c(top_genes$gene_id,
mitochondrial_genes,
rrna_genes)))
counts_filt_alt <- counts[keep_sparse,]
print(paste("Filtered dataset:", nrow(counts_filt), "genes,", ncol(counts_filt)-2, "samples,", round(sum(counts_filt[,-c(1,2)])/1000000, 2), "M UMIs"))[1] "Filtered dataset: 12840 genes, 40 samples, 3.85 M UMIs"
print(paste("Filtered dataset:", nrow(counts_filt_alt), "genes,", ncol(counts_filt_alt)-2, "samples,", round(sum(counts_filt_alt[,-c(1,2)])/1000000, 2), "M UMIs"))[1] "Filtered dataset: 12869 genes, 40 samples, 4.59 M UMIs"
PCA After Filtering - All Samples
stringent
count_matrix_filt <- as.matrix(counts_filt[,-c(1:2)])
rownames(count_matrix_filt) <- counts_filt$gene_id
logcpm_filt <- cpm(count_matrix_filt, log=TRUE, prior.count=1)
# Remove genes with low variance for PCA
var_genes_filt <- apply(logcpm_filt, 1, var)
high_var_genes_filt <- order(var_genes_filt, decreasing=TRUE)[1:min(1000, nrow(logcpm_filt))]
pca_filt <- prcomp(t(logcpm_filt[high_var_genes_filt,]), scale=TRUE)
# Calculate library sizes for filtered data
lib_sizes_filt <- colSums(count_matrix_filt)
pca_df_filt <- data.frame(
PC1 = pca_filt$x[,1],
PC2 = pca_filt$x[,2],
sample = rownames(pca_filt$x),
pool_size = sample_info$pool_size,
lib_size = lib_sizes_filt[rownames(pca_filt$x)]
)
pc1_var_filt <- round(summary(pca_filt)$importance[2,1]*100, 1)
pc2_var_filt <- round(summary(pca_filt)$importance[2,2]*100, 1)
# PCA colored by library size and shaped by pool size
ggplot(pca_df_filt, aes(x = PC1, y = PC2, color = lib_size, shape = pool_size)) +
geom_point(size = 4) +
theme_minimal() +
labs(title = "PCA after filtering (colored by library size)",
x = paste0("PC1 (", pc1_var_filt, "%)"),
y = paste0("PC2 (", pc2_var_filt, "%)"),
color = "Library size",
shape = "Pool size") +
scale_color_gradient(low = "blue", high = "red", labels = comma) +
scale_shape_manual(values = c(16, 17, 18))lenient
count_matrix_filt <- as.matrix(counts_filt_alt[,-c(1:2)])
rownames(count_matrix_filt) <- counts_filt_alt$gene_id
logcpm_filt <- cpm(count_matrix_filt, log=TRUE, prior.count=1)
# Remove genes with low variance for PCA
var_genes_filt <- apply(logcpm_filt, 1, var)
high_var_genes_filt <- order(var_genes_filt, decreasing=TRUE)[1:min(1000, nrow(logcpm_filt))]
pca_filt <- prcomp(t(logcpm_filt[high_var_genes_filt,]), scale=TRUE)
# Calculate library sizes for filtered data
lib_sizes_filt <- colSums(count_matrix_filt)
pca_df_filt <- data.frame(
PC1 = pca_filt$x[,1],
PC2 = pca_filt$x[,2],
sample = rownames(pca_filt$x),
pool_size = sample_info$pool_size,
lib_size = lib_sizes_filt[rownames(pca_filt$x)]
)
pc1_var_filt <- round(summary(pca_filt)$importance[2,1]*100, 1)
pc2_var_filt <- round(summary(pca_filt)$importance[2,2]*100, 1)
# PCA colored by library size and shaped by pool size
ggplot(pca_df_filt, aes(x = PC1, y = PC2, color = lib_size, shape = pool_size)) +
geom_point(size = 4) +
theme_minimal() +
labs(title = "PCA after filtering (colored by library size)",
x = paste0("PC1 (", pc1_var_filt, "%)"),
y = paste0("PC2 (", pc2_var_filt, "%)"),
color = "Library size",
shape = "Pool size") +
scale_color_gradient(low = "blue", high = "red", labels = comma) +
scale_shape_manual(values = c(16, 17, 18))DEG
stringent
Model Diagnostics - All Samples
Before performing differential expression analysis.
# Convert to DGEList and calculate normalization factors
dge <- DGEList(counts = counts_filt %>% column_to_rownames(var="gene_id"), samples = sample_info)Setting first column of `counts` as gene annotation.
dge <- calcNormFactors(dge)
# Estimate dispersion and create dispersion plot
dge <- estimateDisp(dge, design=model.matrix(~ pool_size, data = sample_info))Statistical Analysis - All Samples
We use limma-trend for differential expression analysis, which is appropriate for RNA-seq data.
# Create design matrix
design <- model.matrix(~ 0 + pool_size, data = sample_info)
colnames(design) <- levels(sample_info$pool_size)
# Estimate mean-variance trend and fit linear model
logCPM <- cpm(dge, log = TRUE, prior.count = 1)
fit <- lmFit(logCPM, design)
fit <- eBayes(fit, trend = TRUE)
# Define contrasts for pairwise comparisons
contrasts <- makeContrasts(
pool320_vs_pool080 = pool320 - pool080,
pool920_vs_pool080 = pool920 - pool080,
pool920_vs_pool320 = pool920 - pool320,
levels = design
)
# Fit contrasts
fit_contrasts <- contrasts.fit(fit, contrasts)
fit_contrasts <- eBayes(fit_contrasts, trend = TRUE)
# Extract results for each comparison
results_320_vs_080 <- topTable(fit_contrasts, coef = "pool320_vs_pool080",
number = Inf, sort.by = "P")
results_920_vs_080 <- topTable(fit_contrasts, coef = "pool920_vs_pool080",
number = Inf, sort.by = "P")
results_920_vs_320 <- topTable(fit_contrasts, coef = "pool920_vs_pool320",
number = Inf, sort.by = "P")Results Summary - Stringent Filtering
# Summary for 320ng vs 080ng
sig_320_vs_080 <- results_320_vs_080[results_320_vs_080$adj.P.Val < 0.05,]
up_320_vs_080 <- sum(sig_320_vs_080$logFC > 0)
down_320_vs_080 <- sum(sig_320_vs_080$logFC < 0)
print(paste("320ng vs 080ng:", nrow(sig_320_vs_080), "DE genes"))[1] "320ng vs 080ng: 3 DE genes"
print(paste(" Up-regulated:", up_320_vs_080))[1] " Up-regulated: 3"
print(paste(" Down-regulated:", down_320_vs_080))[1] " Down-regulated: 0"
# Summary for 920ng vs 080ng
sig_920_vs_080_summary <- results_920_vs_080[results_920_vs_080$adj.P.Val < 0.05,]
up_920_vs_080 <- sum(sig_920_vs_080_summary$logFC > 0)
down_920_vs_080 <- sum(sig_920_vs_080_summary$logFC < 0)
print(paste("920ng vs 080ng:", nrow(sig_920_vs_080_summary), "DE genes"))[1] "920ng vs 080ng: 27 DE genes"
print(paste(" Up-regulated:", up_920_vs_080))[1] " Up-regulated: 17"
print(paste(" Down-regulated:", down_920_vs_080))[1] " Down-regulated: 10"
# Summary for 920ng vs 320ng
sig_920_vs_320 <- results_920_vs_320[results_920_vs_320$adj.P.Val < 0.05,]
up_920_vs_320 <- sum(sig_920_vs_320$logFC > 0)
down_920_vs_320 <- sum(sig_920_vs_320$logFC < 0)
print(paste("920ng vs 320ng:", nrow(sig_920_vs_320), "DE genes"))[1] "920ng vs 320ng: 15 DE genes"
print(paste(" Up-regulated:", up_920_vs_320))[1] " Up-regulated: 10"
print(paste(" Down-regulated:", down_920_vs_320))[1] " Down-regulated: 5"
lenient
Model Diagnostics - All Samples
Before performing differential expression analysis.
# Convert to DGEList and calculate normalization factors
dge <- DGEList(counts = counts_filt_alt %>% column_to_rownames(var="gene_id"), samples = sample_info)Setting first column of `counts` as gene annotation.
dge <- calcNormFactors(dge)
# Estimate dispersion and create dispersion plot
dge <- estimateDisp(dge, design=model.matrix(~ pool_size, data = sample_info))Statistical Analysis - All Samples
We use limma-trend for differential expression analysis, which is appropriate for RNA-seq data.
# Create design matrix
design <- model.matrix(~ 0 + pool_size, data = sample_info)
colnames(design) <- levels(sample_info$pool_size)
# Estimate mean-variance trend and fit linear model
logCPM <- cpm(dge, log = TRUE, prior.count = 1)
fit <- lmFit(logCPM, design)
fit <- eBayes(fit, trend = TRUE)
# Define contrasts for pairwise comparisons
contrasts <- makeContrasts(
pool320_vs_pool080 = pool320 - pool080,
pool920_vs_pool080 = pool920 - pool080,
pool920_vs_pool320 = pool920 - pool320,
levels = design
)
# Fit contrasts
fit_contrasts <- contrasts.fit(fit, contrasts)
fit_contrasts <- eBayes(fit_contrasts, trend = TRUE)
# Extract results for each comparison with lenient suffix
results_320_vs_080_lenient <- topTable(fit_contrasts, coef = "pool320_vs_pool080",
number = Inf, sort.by = "P")
results_920_vs_080_lenient <- topTable(fit_contrasts, coef = "pool920_vs_pool080",
number = Inf, sort.by = "P")
results_920_vs_320_lenient <- topTable(fit_contrasts, coef = "pool920_vs_pool320",
number = Inf, sort.by = "P")Results Summary - Lenient Filtering
# Summary for 320ng vs 080ng
sig_320_vs_080 <- results_320_vs_080_lenient[results_320_vs_080_lenient$adj.P.Val < 0.05,]
up_320_vs_080 <- sum(sig_320_vs_080$logFC > 0)
down_320_vs_080 <- sum(sig_320_vs_080$logFC < 0)
print(paste("320ng vs 080ng:", nrow(sig_320_vs_080), "DE genes"))[1] "320ng vs 080ng: 4 DE genes"
print(paste(" Up-regulated:", up_320_vs_080))[1] " Up-regulated: 4"
print(paste(" Down-regulated:", down_320_vs_080))[1] " Down-regulated: 0"
# Summary for 920ng vs 080ng
sig_920_vs_080_summary <- results_920_vs_080_lenient[results_920_vs_080_lenient$adj.P.Val < 0.05,]
up_920_vs_080 <- sum(sig_920_vs_080_summary$logFC > 0)
down_920_vs_080 <- sum(sig_920_vs_080_summary$logFC < 0)
print(paste("920ng vs 080ng:", nrow(sig_920_vs_080_summary), "DE genes"))[1] "920ng vs 080ng: 37 DE genes"
print(paste(" Up-regulated:", up_920_vs_080))[1] " Up-regulated: 29"
print(paste(" Down-regulated:", down_920_vs_080))[1] " Down-regulated: 8"
# Summary for 920ng vs 320ng
sig_920_vs_320 <- results_920_vs_320_lenient[results_920_vs_320_lenient$adj.P.Val < 0.05,]
up_920_vs_320 <- sum(sig_920_vs_320$logFC > 0)
down_920_vs_320 <- sum(sig_920_vs_320$logFC < 0)
print(paste("920ng vs 320ng:", nrow(sig_920_vs_320), "DE genes"))[1] "920ng vs 320ng: 29 DE genes"
print(paste(" Up-regulated:", up_920_vs_320))[1] " Up-regulated: 23"
print(paste(" Down-regulated:", down_920_vs_320))[1] " Down-regulated: 6"
Analysis with Top 8 Samples
This section analyzes only the top 8 samples per condition (highest UMI counts) with different filtering strategies to address potential library size confounding effects observed in PCA.
Sample Selection Strategy
Select the top 8 samples with highest library sizes from each pool size condition.
# Calculate library sizes for all samples
lib_sizes_all <- colSums(counts[,-c(1:2)])
lib_df_all <- data.frame(
sample = names(lib_sizes_all),
lib_size = lib_sizes_all,
pool_size = sample_info$pool_size[match(names(lib_sizes_all), sample_info$sample)]
)
# Select top 8 samples per condition
top8_samples <- lib_df_all %>%
group_by(pool_size) %>%
top_n(8, lib_size) %>%
arrange(pool_size, desc(lib_size)) %>%
ungroup()
print("Top 8 samples per condition by library size:")[1] "Top 8 samples per condition by library size:"
knitr::kable(top8_samples, caption = "Top 8 samples per condition selected for analysis")| sample | lib_size | pool_size |
|---|---|---|
| pool080_GTACAAGACGTG | 196486 | pool080 |
| pool080_TGGTGGCGCTTA | 121620 | pool080 |
| pool080_TATTCCTGTAGG | 120800 | pool080 |
| pool080_CTGGCGTTGCTA | 117406 | pool080 |
| pool080_TGGCGTAACCTC | 114864 | pool080 |
| pool080_ATAGAGTACTCC | 106897 | pool080 |
| pool080_TGCTACCAACAA | 106764 | pool080 |
| pool080_CAGACAGCATAA | 77162 | pool080 |
| pool320_CTGGCGTTGCTA | 155582 | pool320 |
| pool320_TATTCCTGTAGG | 131051 | pool320 |
| pool320_TGGCGTAACCTC | 122014 | pool320 |
| pool320_CAGACAGCATAA | 121307 | pool320 |
| pool320_TGGTGGCGCTTA | 118451 | pool320 |
| pool320_GTACAAGACGTG | 114927 | pool320 |
| pool320_ATAGAGTACTCC | 100200 | pool320 |
| pool320_ATGGTCCACTCG | 94998 | pool320 |
| pool920_CTGGCGTTGCTA | 339521 | pool920 |
| pool920_TATTCCTGTAGG | 313842 | pool920 |
| pool920_GTACAAGACGTG | 284091 | pool920 |
| pool920_TGCTACCAACAA | 282487 | pool920 |
| pool920_CAGACAGCATAA | 280298 | pool920 |
| pool920_ATAGAGTACTCC | 261924 | pool920 |
| pool920_TGGTGGCGCTTA | 249454 | pool920 |
| pool920_TGGCGTAACCTC | 207290 | pool920 |
# Filter counts for top 8 samples (use original unfiltered counts)
selected_samples <- top8_samples$sample
counts_top8_unfiltered <- counts %>%
dplyr::select(gene_id, gene_name, all_of(selected_samples))
sample_info_top8 <- sample_info %>%
filter(sample %in% selected_samples) %>%
mutate(pool_size = factor(pool_size, levels = c("pool080", "pool320", "pool920")))
print(paste("Top 8 samples dataset:", nrow(counts_top8_unfiltered), "genes,", ncol(counts_top8_unfiltered)-2, "samples"))[1] "Top 8 samples dataset: 24311 genes, 24 samples"
print(paste("Sample distribution: pool080 =", sum(sample_info_top8$pool_size == "pool080"),
", pool320 =", sum(sample_info_top8$pool_size == "pool320"),
", pool920 =", sum(sample_info_top8$pool_size == "pool920")))[1] "Sample distribution: pool080 = 8 , pool320 = 8 , pool920 = 8"
## Downsample to match library sizes
# Calculate target library size (min of pool080 and pool320 top 8 samples)
pool080_lib_sizes <- colSums(counts_top8_unfiltered[, sample_info_top8$sample[sample_info_top8$pool_size == "pool080"]])
pool320_lib_sizes <- colSums(counts_top8_unfiltered[, sample_info_top8$sample[sample_info_top8$pool_size == "pool320"]])
target_lib_size <- min(c(pool080_lib_sizes, pool320_lib_sizes))
print(paste("Target library size for downsampling:", round(target_lib_size, 0)))[1] "Target library size for downsampling: 77162"
print(paste("Pool080 min library size:", round(min(pool080_lib_sizes), 0)))[1] "Pool080 min library size: 77162"
print(paste("Pool320 min library size:", round(min(pool320_lib_sizes), 0)))[1] "Pool320 min library size: 94998"
print(paste("Pool920 min library size:", round(min(pool320_lib_sizes), 0)))[1] "Pool920 min library size: 94998"
# Get pool920 samples and their current library sizes
lib_sizes <- colSums(counts_top8_unfiltered[,-c(1,2)])
print("library sizes before downsampling:")[1] "library sizes before downsampling:"
print(round(lib_sizes, 0))pool080_GTACAAGACGTG pool080_TGGTGGCGCTTA pool080_TATTCCTGTAGG
196486 121620 120800
pool080_CTGGCGTTGCTA pool080_TGGCGTAACCTC pool080_ATAGAGTACTCC
117406 114864 106897
pool080_TGCTACCAACAA pool080_CAGACAGCATAA pool320_CTGGCGTTGCTA
106764 77162 155582
pool320_TATTCCTGTAGG pool320_TGGCGTAACCTC pool320_CAGACAGCATAA
131051 122014 121307
pool320_TGGTGGCGCTTA pool320_GTACAAGACGTG pool320_ATAGAGTACTCC
118451 114927 100200
pool320_ATGGTCCACTCG pool920_CTGGCGTTGCTA pool920_TATTCCTGTAGG
94998 339521 313842
pool920_GTACAAGACGTG pool920_TGCTACCAACAA pool920_CAGACAGCATAA
284091 282487 280298
pool920_ATAGAGTACTCC pool920_TGGTGGCGCTTA pool920_TGGCGTAACCTC
261924 249454 207290
# Downsample
set.seed(123) # For reproducibility
counts_top8_downsampled <- counts_top8_unfiltered
for(sample_name in sample_info_top8$sample) {
current_lib_size <- lib_sizes[sample_name]
if(current_lib_size > target_lib_size) {
# Calculate downsampling probability
downsample_prob <- target_lib_size / current_lib_size
# Get count vector for this sample (excluding gene_id and gene_name columns)
sample_counts <- counts_top8_unfiltered[[sample_name]]
# Downsample using binomial sampling
downsampled_counts <- rbinom(n = length(sample_counts),
size = sample_counts,
prob = downsample_prob)
# Update the counts matrix
counts_top8_downsampled[[sample_name]] <- downsampled_counts
cat("Sample", sample_name, ": downsampled from", current_lib_size, "to", sum(downsampled_counts), "UMIs\n")
} else {
cat("Sample", sample_name, ": no downsampling needed (", current_lib_size, "UMIs)\n")
}
}Sample pool080_ATAGAGTACTCC : downsampled from 106897 to 77342 UMIs
Sample pool080_CAGACAGCATAA : no downsampling needed ( 77162 UMIs)
Sample pool080_CTGGCGTTGCTA : downsampled from 117406 to 77403 UMIs
Sample pool080_GTACAAGACGTG : downsampled from 196486 to 76849 UMIs
Sample pool080_TATTCCTGTAGG : downsampled from 120800 to 77158 UMIs
Sample pool080_TGCTACCAACAA : downsampled from 106764 to 77104 UMIs
Sample pool080_TGGCGTAACCTC : downsampled from 114864 to 77317 UMIs
Sample pool080_TGGTGGCGCTTA : downsampled from 121620 to 77060 UMIs
Sample pool320_ATAGAGTACTCC : downsampled from 100200 to 77158 UMIs
Sample pool320_ATGGTCCACTCG : downsampled from 94998 to 77137 UMIs
Sample pool320_CAGACAGCATAA : downsampled from 121307 to 77229 UMIs
Sample pool320_CTGGCGTTGCTA : downsampled from 155582 to 76992 UMIs
Sample pool320_GTACAAGACGTG : downsampled from 114927 to 77181 UMIs
Sample pool320_TATTCCTGTAGG : downsampled from 131051 to 77466 UMIs
Sample pool320_TGGCGTAACCTC : downsampled from 122014 to 77176 UMIs
Sample pool320_TGGTGGCGCTTA : downsampled from 118451 to 77272 UMIs
Sample pool920_ATAGAGTACTCC : downsampled from 261924 to 77166 UMIs
Sample pool920_CAGACAGCATAA : downsampled from 280298 to 77486 UMIs
Sample pool920_CTGGCGTTGCTA : downsampled from 339521 to 77235 UMIs
Sample pool920_GTACAAGACGTG : downsampled from 284091 to 77546 UMIs
Sample pool920_TATTCCTGTAGG : downsampled from 313842 to 76926 UMIs
Sample pool920_TGCTACCAACAA : downsampled from 282487 to 77385 UMIs
Sample pool920_TGGCGTAACCTC : downsampled from 207290 to 76876 UMIs
Sample pool920_TGGTGGCGCTTA : downsampled from 249454 to 76825 UMIs
# Verify downsampling results
lib_sizes_after <- colSums(counts_top8_downsampled[,-c(1,2)])
print("\nLibrary sizes after downsampling:")[1] "\nLibrary sizes after downsampling:"
print("Pool080:")[1] "Pool080:"
print(round(colSums(counts_top8_downsampled[, sample_info_top8$sample[sample_info_top8$pool_size == "pool080"]]), 0))pool080_ATAGAGTACTCC pool080_CAGACAGCATAA pool080_CTGGCGTTGCTA
77342 77162 77403
pool080_GTACAAGACGTG pool080_TATTCCTGTAGG pool080_TGCTACCAACAA
76849 77158 77104
pool080_TGGCGTAACCTC pool080_TGGTGGCGCTTA
77317 77060
print("Pool320:")[1] "Pool320:"
print(round(colSums(counts_top8_downsampled[, sample_info_top8$sample[sample_info_top8$pool_size == "pool320"]]), 0))pool320_ATAGAGTACTCC pool320_ATGGTCCACTCG pool320_CAGACAGCATAA
77158 77137 77229
pool320_CTGGCGTTGCTA pool320_GTACAAGACGTG pool320_TATTCCTGTAGG
76992 77181 77466
pool320_TGGCGTAACCTC pool320_TGGTGGCGCTTA
77176 77272
print("Pool920:")[1] "Pool920:"
print(round(colSums(counts_top8_downsampled[, sample_info_top8$sample[sample_info_top8$pool_size == "pool920"]]), 0))pool920_ATAGAGTACTCC pool920_CAGACAGCATAA pool920_CTGGCGTTGCTA
77166 77486 77235
pool920_GTACAAGACGTG pool920_TATTCCTGTAGG pool920_TGCTACCAACAA
77546 76926 77385
pool920_TGGCGTAACCTC pool920_TGGTGGCGCTTA
76876 76825
# Use downsampled counts for further analysis
counts_top8_unfiltered <- counts_top8_downsampledQuality Control - Top 8 Samples
Library Size Distribution - Top 8 Samples
lib_sizes_top8_all <- colSums(counts_top8_unfiltered[,-c(1:2)])
lib_df_top8_all <- data.frame(
sample = names(lib_sizes_top8_all),
lib_size = lib_sizes_top8_all,
pool_size = sample_info_top8$pool_size
)
ggplot(lib_df_top8_all, aes(x = sample, y = lib_size, fill = pool_size)) +
geom_bar(stat = "identity") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
labs(title = "Library sizes - Top 8 samples per condition (after downsampling)",
x = "Sample", y = "UMI counts", fill = "Pool size") +
scale_fill_brewer(type = "qual", palette = "Set1") +
scale_y_continuous(labels = comma)# Summary statistics
cat("Library size summary statistics after downsampling:\n")Library size summary statistics after downsampling:
lib_summary <- lib_df_top8_all %>%
group_by(pool_size) %>%
summarise(
median = median(lib_size),
mean = mean(lib_size),
min = min(lib_size),
max = max(lib_size),
.groups = 'drop'
)
knitr::kable(lib_summary, caption = "Library size statistics by pool size (after downsampling)")| pool_size | median | mean | min | max |
|---|---|---|---|---|
| pool080 | 77160.0 | 77174.38 | 76849 | 77403 |
| pool320 | 77178.5 | 77201.38 | 76992 | 77466 |
| pool920 | 77200.5 | 77180.62 | 76825 | 77546 |
PCA Before Filtering - Top 8 Samples
count_matrix_top8_raw <- as.matrix(counts_top8_unfiltered[,-c(1:2)])
rownames(count_matrix_top8_raw) <- counts_top8_unfiltered$gene_id
logcpm_top8_raw <- cpm(count_matrix_top8_raw, log=TRUE, prior.count=1)
# Remove genes with low variance for PCA
var_genes_top8_raw <- apply(logcpm_top8_raw, 1, var)
high_var_genes_top8_raw <- order(var_genes_top8_raw, decreasing=TRUE)[1:min(1000, nrow(logcpm_top8_raw))]
pca_top8_raw <- prcomp(t(logcpm_top8_raw[high_var_genes_top8_raw,]), scale=TRUE)
pca_df_top8_raw <- data.frame(
PC1 = pca_top8_raw$x[,1],
PC2 = pca_top8_raw$x[,2],
sample = rownames(pca_top8_raw$x),
pool_size = sample_info_top8$pool_size,
lib_size = lib_sizes_top8_all[rownames(pca_top8_raw$x)]
)
pc1_var_top8_raw <- round(summary(pca_top8_raw)$importance[2,1]*100, 1)
pc2_var_top8_raw <- round(summary(pca_top8_raw)$importance[2,2]*100, 1)
# PCA colored by pool size
ggplot(pca_df_top8_raw, aes(x = PC1, y = PC2, color = pool_size)) +
geom_point(size = 3) +
theme_minimal() +
labs(title = "PCA before filtering - Top 8 samples (colored by pool size)\n downsampled to match library sizes",
x = paste0("PC1 (", pc1_var_top8_raw, "%)"),
y = paste0("PC2 (", pc2_var_top8_raw, "%)"),
color = "Pool size") +
scale_color_brewer(type = "qual", palette = "Set1")# PCA colored by library size
ggplot(pca_df_top8_raw, aes(x = PC1, y = PC2, color = lib_size, shape = pool_size)) +
geom_point(size = 4) +
theme_minimal() +
labs(title = "PCA before filtering - Top 8 samples (colored by library size)\n downsampled to match library sizes",
x = paste0("PC1 (", pc1_var_top8_raw, "%)"),
y = paste0("PC2 (", pc2_var_top8_raw, "%)"),
color = "Library size",
shape = "Pool size") +
scale_color_gradient(low = "blue", high = "red", labels = comma) +
scale_shape_manual(values = c(16, 17, 18))Filtering - Top 8 Samples
We apply multiple filtering steps to remove uninformative genes:
- Sparse genes: Remove genes with low expression (CPM > 3 in >= 5 samples)
- optional Highly expressed genes: Remove top 10 most highly expressed genes (likely housekeeping genes)
- optional Mitochondrial genes: Remove mitochondrial genes
- optional Ribosomal RNA genes: Remove rRNA and rRNA pseudogenes
# Step 1: Filter sparse genes CPM>3 in >= 5 samples (for 24 samples)
count_matrix_top8_unfiltered <- as.matrix(counts_top8_unfiltered[,-c(1:2)])
rownames(count_matrix_top8_unfiltered) <- counts_top8_unfiltered$gene_id
cpm_values_top8 <- cpm(count_matrix_top8_unfiltered)
count_matrix_top8_zero <- count_matrix_top8_unfiltered
count_matrix_top8_zero[cpm_values_top8 < 3] <- 0
keep_sparse_top8 <- rowSums(count_matrix_top8_zero > 1) >= 5
print(paste("Genes before sparse filtering (top 8 samples):", nrow(counts_top8_unfiltered)))[1] "Genes before sparse filtering (top 8 samples): 24311"
print(paste("Removed sparse genes (top 8 samples):", sum(!keep_sparse_top8)))[1] "Removed sparse genes (top 8 samples): 13960"
# Step 2: Filter most highly expressed genes (remove top 10)
sum_expression_top8 <- rowSums(count_matrix_top8_unfiltered)
gene_expression_top8 <- tibble(
gene_id = counts_top8_unfiltered$gene_id,
gene_name = counts_top8_unfiltered$gene_name,
total_expression = sum_expression_top8,
perc = total_expression/(sum(count_matrix_top8_unfiltered))*100
)
top_genes_top8 <- gene_expression_top8 %>%
arrange(desc(total_expression)) %>%
head(10)
print("Top 10 genes by total expression (top 8 samples):")[1] "Top 10 genes by total expression (top 8 samples):"
knitr::kable(top_genes_top8, caption = "Top 10 genes by total expression - Top 8 samples")| gene_id | gene_name | total_expression | perc |
|---|---|---|---|
| ENSMUSG00000064339.1 | mt-Rnr2 | 58311 | 3.1477756 |
| ENSMUSG00000101249.2 | Gm29216 | 44755 | 2.4159883 |
| ENSMUSG00000064337.1 | mt-Rnr1 | 42655 | 2.3026250 |
| ENSMUSG00000119584.1 | Rn18s-rs5 | 23552 | 1.2713967 |
| ENSMUSG00000100862.2 | Gm10925 | 19462 | 1.0506081 |
| ENSMUSG00000064370.1 | mt-Cytb | 18945 | 1.0226991 |
| ENSMUSG00000101111.2 | Gm28437 | 16042 | 0.8659878 |
| ENSMUSG00000064363.1 | mt-Nd4 | 15631 | 0.8438010 |
| ENSMUSG00000102070.2 | Gm28661 | 14834 | 0.8007769 |
| ENSMUSG00000064341.1 | mt-Nd1 | 13212 | 0.7132172 |
# Step 3: Filter ribo and mito genes (same as before)
mitochondrial_genes <- all_genes %>%
filter(seqnames == "chrM") %>%
pull(gene_id)
rrna_genes <- all_genes %>%
filter(gene_type %in% c("rRNA","rRNA_pseudogene")) %>%
pull(gene_id)
# Apply all filters
counts_top8_stringent <- counts_top8_unfiltered[keep_sparse_top8,] %>%
filter(!(gene_id %in% c(top_genes_top8$gene_id,
mitochondrial_genes,
rrna_genes)))
counts_top8_lenient <- counts_top8_unfiltered[keep_sparse_top8,]
print(paste("Original dataset (top 8 samples):", nrow(counts_top8_unfiltered), "genes"))[1] "Original dataset (top 8 samples): 24311 genes"
print(paste("Stringent filtered dataset (top 8 samples):", nrow(counts_top8_stringent), "genes"))[1] "Stringent filtered dataset (top 8 samples): 10330 genes"
print(paste("Stringent filtered dataset (top 8 samples):", nrow(counts_top8_lenient), "genes"))[1] "Stringent filtered dataset (top 8 samples): 10351 genes"
PCA after Filtering - Top 8 Samples
stringent
count_matrix_top8 <- as.matrix(counts_top8_stringent[,-c(1:2)])
rownames(count_matrix_top8) <- counts_top8_stringent$gene_id
logcpm_top8 <- cpm(count_matrix_top8, log=TRUE, prior.count=1)
# Remove genes with low variance for PCA
var_genes_top8 <- apply(logcpm_top8, 1, var)
high_var_genes_top8 <- order(var_genes_top8, decreasing=TRUE)[1:min(1000, nrow(logcpm_top8))]
pca_top8 <- prcomp(t(logcpm_top8[high_var_genes_top8,]), scale=TRUE)
pca_df_top8 <- data.frame(
PC1 = pca_top8$x[,1],
PC2 = pca_top8$x[,2],
sample = rownames(pca_top8$x),
pool_size = sample_info_top8$pool_size,
lib_size = lib_sizes_top8_all[rownames(pca_top8$x)]
)
pc1_var_top8 <- round(summary(pca_top8)$importance[2,1]*100, 1)
pc2_var_top8 <- round(summary(pca_top8)$importance[2,2]*100, 1)
# PCA colored by pool size
ggplot(pca_df_top8, aes(x = PC1, y = PC2, color = pool_size)) +
geom_point(size = 3) +
theme_minimal() +
labs(title = "PCA with top 8 samples per condition (colored by pool size)\n downsampled to match library sizes",
x = paste0("PC1 (", pc1_var_top8, "%)"),
y = paste0("PC2 (", pc2_var_top8, "%)"),
color = "Pool size") +
scale_color_brewer(type = "qual", palette = "Set1")# PCA colored by library size
ggplot(pca_df_top8, aes(x = PC1, y = PC2, color = lib_size, shape = pool_size)) +
geom_point(size = 4) +
theme_minimal() +
labs(title = "PCA with top 8 samples per condition (colored by library size)\n
downsampled to match library sizes",
x = paste0("PC1 (", pc1_var_top8, "%)"),
y = paste0("PC2 (", pc2_var_top8, "%)"),
color = "Library size",
shape = "Pool size") +
scale_color_gradient(low = "blue", high = "red", labels = comma) +
scale_shape_manual(values = c(16, 17, 18))lenient
count_matrix_top8 <- as.matrix(counts_top8_lenient[,-c(1:2)])
rownames(count_matrix_top8) <- counts_top8_lenient$gene_id
logcpm_top8 <- cpm(count_matrix_top8, log=TRUE, prior.count=1)
# Remove genes with low variance for PCA
var_genes_top8 <- apply(logcpm_top8, 1, var)
high_var_genes_top8 <- order(var_genes_top8, decreasing=TRUE)[1:min(1000, nrow(logcpm_top8))]
pca_top8 <- prcomp(t(logcpm_top8[high_var_genes_top8,]), scale=TRUE)
pca_df_top8 <- data.frame(
PC1 = pca_top8$x[,1],
PC2 = pca_top8$x[,2],
sample = rownames(pca_top8$x),
pool_size = sample_info_top8$pool_size,
lib_size = lib_sizes_top8_all[rownames(pca_top8$x)]
)
pc1_var_top8 <- round(summary(pca_top8)$importance[2,1]*100, 1)
pc2_var_top8 <- round(summary(pca_top8)$importance[2,2]*100, 1)
# PCA colored by pool size
ggplot(pca_df_top8, aes(x = PC1, y = PC2, color = pool_size)) +
geom_point(size = 3) +
theme_minimal() +
labs(title = "PCA with top 8 samples per condition (colored by pool size)\n downsampled to match library sizes",
x = paste0("PC1 (", pc1_var_top8, "%)"),
y = paste0("PC2 (", pc2_var_top8, "%)"),
color = "Pool size") +
scale_color_brewer(type = "qual", palette = "Set1")# PCA colored by library size
ggplot(pca_df_top8, aes(x = PC1, y = PC2, color = lib_size, shape = pool_size)) +
geom_point(size = 4) +
theme_minimal() +
labs(title = "PCA with top 8 samples per condition (colored by library size)\n downsampled to match library sizes",
x = paste0("PC1 (", pc1_var_top8, "%)"),
y = paste0("PC2 (", pc2_var_top8, "%)"),
color = "Library size",
shape = "Pool size") +
scale_color_gradient(low = "blue", high = "red", labels = comma) +
scale_shape_manual(values = c(16, 17, 18))DEG - Top 8 Samples
stringent
Model Diagnostics - All Samples
Before performing differential expression analysis.
# Convert to DGEList and calculate normalization factors
dge <- DGEList(counts = counts_top8_stringent %>% column_to_rownames(var="gene_id"), samples = sample_info_top8)Setting first column of `counts` as gene annotation.
dge <- calcNormFactors(dge)
# Estimate dispersion and create dispersion plot
dge <- estimateDisp(dge, design=model.matrix(~ pool_size, data = sample_info_top8))Statistical Analysis - All Samples
We use limma-trend for differential expression analysis, which is appropriate for RNA-seq data.
# Create design matrix
design <- model.matrix(~ 0 + pool_size, data = sample_info_top8)
colnames(design) <- levels(sample_info_top8$pool_size)
# Estimate mean-variance trend and fit linear model
logCPM <- cpm(dge, log = TRUE, prior.count = 1)
fit <- lmFit(logCPM, design)
fit <- eBayes(fit, trend = TRUE)
# Define contrasts for pairwise comparisons
contrasts <- makeContrasts(
pool320_vs_pool080 = pool320 - pool080,
pool920_vs_pool080 = pool920 - pool080,
pool920_vs_pool320 = pool920 - pool320,
levels = design
)
# Fit contrasts
fit_contrasts <- contrasts.fit(fit, contrasts)
fit_contrasts <- eBayes(fit_contrasts, trend = TRUE)
# Extract results for each comparison with top8 stringent suffix
results_320_vs_080_top8_stringent <- topTable(fit_contrasts, coef = "pool320_vs_pool080",
number = Inf, sort.by = "P")
results_920_vs_080_top8_stringent <- topTable(fit_contrasts, coef = "pool920_vs_pool080",
number = Inf, sort.by = "P")
results_920_vs_320_top8_stringent <- topTable(fit_contrasts, coef = "pool920_vs_pool320",
number = Inf, sort.by = "P")Results Summary - Stringent Filtering
# Summary for 320ng vs 080ng
sig_320_vs_080 <- results_320_vs_080_top8_stringent[results_320_vs_080_top8_stringent$adj.P.Val < 0.05,]
up_320_vs_080 <- sum(sig_320_vs_080$logFC > 0)
down_320_vs_080 <- sum(sig_320_vs_080$logFC < 0)
print(paste("320ng vs 080ng:", nrow(sig_320_vs_080), "DE genes"))[1] "320ng vs 080ng: 7 DE genes"
print(paste(" Up-regulated:", up_320_vs_080))[1] " Up-regulated: 6"
print(paste(" Down-regulated:", down_320_vs_080))[1] " Down-regulated: 1"
# Summary for 920ng vs 080ng
sig_920_vs_080_summary <- results_920_vs_080_top8_stringent[results_920_vs_080_top8_stringent$adj.P.Val < 0.05,]
up_920_vs_080 <- sum(sig_920_vs_080_summary$logFC > 0)
down_920_vs_080 <- sum(sig_920_vs_080_summary$logFC < 0)
print(paste("920ng vs 080ng:", nrow(sig_920_vs_080_summary), "DE genes"))[1] "920ng vs 080ng: 100 DE genes"
print(paste(" Up-regulated:", up_920_vs_080))[1] " Up-regulated: 63"
print(paste(" Down-regulated:", down_920_vs_080))[1] " Down-regulated: 37"
# Summary for 920ng vs 320ng
sig_920_vs_320 <- results_920_vs_320_top8_stringent[results_920_vs_320_top8_stringent$adj.P.Val < 0.05,]
up_920_vs_320 <- sum(sig_920_vs_320$logFC > 0)
down_920_vs_320 <- sum(sig_920_vs_320$logFC < 0)
print(paste("920ng vs 320ng:", nrow(sig_920_vs_320), "DE genes"))[1] "920ng vs 320ng: 35 DE genes"
print(paste(" Up-regulated:", up_920_vs_320))[1] " Up-regulated: 21"
print(paste(" Down-regulated:", down_920_vs_320))[1] " Down-regulated: 14"
lenient
Model Diagnostics - All Samples
Before performing differential expression analysis.
# Convert to DGEList and calculate normalization factors
dge <- DGEList(counts = counts_top8_lenient %>% column_to_rownames(var="gene_id"), samples = sample_info_top8)Setting first column of `counts` as gene annotation.
dge <- calcNormFactors(dge)
# Estimate dispersion and create dispersion plot
dge <- estimateDisp(dge, design=model.matrix(~ pool_size, data = sample_info_top8))Statistical Analysis - All Samples
We use limma-trend for differential expression analysis, which is appropriate for RNA-seq data.
# Create design matrix
design <- model.matrix(~ 0 + pool_size, data = sample_info_top8)
colnames(design) <- levels(sample_info_top8$pool_size)
# Estimate mean-variance trend and fit linear model
logCPM <- cpm(dge, log = TRUE, prior.count = 1)
fit <- lmFit(logCPM, design)
fit <- eBayes(fit, trend = TRUE)
# Define contrasts for pairwise comparisons
contrasts <- makeContrasts(
pool320_vs_pool080 = pool320 - pool080,
pool920_vs_pool080 = pool920 - pool080,
pool920_vs_pool320 = pool920 - pool320,
levels = design
)
# Fit contrasts
fit_contrasts <- contrasts.fit(fit, contrasts)
fit_contrasts <- eBayes(fit_contrasts, trend = TRUE)
# Extract results for each comparison with top8 lenient suffix
results_320_vs_080_top8_lenient <- topTable(fit_contrasts, coef = "pool320_vs_pool080",
number = Inf, sort.by = "P")
results_920_vs_080_top8_lenient <- topTable(fit_contrasts, coef = "pool920_vs_pool080",
number = Inf, sort.by = "P")
results_920_vs_320_top8_lenient <- topTable(fit_contrasts, coef = "pool920_vs_pool320",
number = Inf, sort.by = "P")Results Summary - Lenient Filtering
# Summary for 320ng vs 080ng
sig_320_vs_080 <- results_320_vs_080_top8_lenient[results_320_vs_080_top8_lenient$adj.P.Val < 0.05,]
up_320_vs_080 <- sum(sig_320_vs_080$logFC > 0)
down_320_vs_080 <- sum(sig_320_vs_080$logFC < 0)
print(paste("320ng vs 080ng:", nrow(sig_320_vs_080), "DE genes"))[1] "320ng vs 080ng: 13 DE genes"
print(paste(" Up-regulated:", up_320_vs_080))[1] " Up-regulated: 12"
print(paste(" Down-regulated:", down_320_vs_080))[1] " Down-regulated: 1"
# Summary for 920ng vs 080ng
sig_920_vs_080_summary <- results_920_vs_080_top8_lenient[results_920_vs_080_top8_lenient$adj.P.Val < 0.05,]
up_920_vs_080 <- sum(sig_920_vs_080_summary$logFC > 0)
down_920_vs_080 <- sum(sig_920_vs_080_summary$logFC < 0)
print(paste("920ng vs 080ng:", nrow(sig_920_vs_080_summary), "DE genes"))[1] "920ng vs 080ng: 116 DE genes"
print(paste(" Up-regulated:", up_920_vs_080))[1] " Up-regulated: 78"
print(paste(" Down-regulated:", down_920_vs_080))[1] " Down-regulated: 38"
# Summary for 920ng vs 320ng
sig_920_vs_320 <- results_920_vs_320_top8_lenient[results_920_vs_320_top8_lenient$adj.P.Val < 0.05,]
up_920_vs_320 <- sum(sig_920_vs_320$logFC > 0)
down_920_vs_320 <- sum(sig_920_vs_320$logFC < 0)
print(paste("920ng vs 320ng:", nrow(sig_920_vs_320), "DE genes"))[1] "920ng vs 320ng: 49 DE genes"
print(paste(" Up-regulated:", up_920_vs_320))[1] " Up-regulated: 34"
print(paste(" Down-regulated:", down_920_vs_320))[1] " Down-regulated: 15"
DE Gene Results Tables
All Samples - Stringent Filtering
320ng vs 080ng - All Samples Stringent
# Get significant genes and add gene names
sig_genes_320_vs_080_all_stringent <- results_320_vs_080 %>%
filter(adj.P.Val < 0.05) %>%
rownames_to_column("gene_id") %>%
left_join(genes, by="gene_id") %>%
dplyr::select(gene_id, gene_name, logFC, AveExpr, t, P.Value, adj.P.Val, B) %>%
arrange(desc(abs(logFC)))
print(paste("Number of significant genes (All samples - stringent):", nrow(sig_genes_320_vs_080_all_stringent)))[1] "Number of significant genes (All samples - stringent): 3"
if(nrow(sig_genes_320_vs_080_all_stringent) > 0) {
DT::datatable(sig_genes_320_vs_080_all_stringent,
caption = "DE genes: 320ng vs 080ng (All samples - stringent filtering, FDR < 0.05)",
options = list(pageLength = 25, scrollX = TRUE, order = list(list(2, 'desc'))),
filter = 'top') %>%
DT::formatRound(columns = c('logFC', 'AveExpr', 't', 'P.Value', 'adj.P.Val', 'B'), digits = 4)
} else {
cat("No significant genes found.")
}920ng vs 080ng - All Samples Stringent
sig_genes_920_vs_080_all_stringent <- results_920_vs_080 %>%
filter(adj.P.Val < 0.05) %>%
rownames_to_column("gene_id") %>%
left_join(genes, by="gene_id") %>%
dplyr::select(gene_id, gene_name, logFC, AveExpr, t, P.Value, adj.P.Val, B) %>%
arrange(desc(abs(logFC)))
print(paste("Number of significant genes (All samples - stringent):", nrow(sig_genes_920_vs_080_all_stringent)))[1] "Number of significant genes (All samples - stringent): 27"
if(nrow(sig_genes_920_vs_080_all_stringent) > 0) {
DT::datatable(sig_genes_920_vs_080_all_stringent,
caption = "DE genes: 920ng vs 080ng (All samples - stringent filtering, FDR < 0.05)",
options = list(pageLength = 25, scrollX = TRUE, order = list(list(2, 'desc'))),
filter = 'top') %>%
DT::formatRound(columns = c('logFC', 'AveExpr', 't', 'P.Value', 'adj.P.Val', 'B'), digits = 4)
} else {
cat("No significant genes found.")
}920ng vs 320ng - All Samples Stringent
sig_genes_920_vs_320_all_stringent <- results_920_vs_320 %>%
filter(adj.P.Val < 0.05) %>%
rownames_to_column("gene_id") %>%
left_join(genes, by="gene_id") %>%
dplyr::select(gene_id, gene_name, logFC, AveExpr, t, P.Value, adj.P.Val, B) %>%
arrange(desc(abs(logFC)))
print(paste("Number of significant genes (All samples - stringent):", nrow(sig_genes_920_vs_320_all_stringent)))[1] "Number of significant genes (All samples - stringent): 15"
if(nrow(sig_genes_920_vs_320_all_stringent) > 0) {
DT::datatable(sig_genes_920_vs_320_all_stringent,
caption = "DE genes: 920ng vs 320ng (All samples - stringent filtering, FDR < 0.05)",
options = list(pageLength = 25, scrollX = TRUE, order = list(list(2, 'desc'))),
filter = 'top') %>%
DT::formatRound(columns = c('logFC', 'AveExpr', 't', 'P.Value', 'adj.P.Val', 'B'), digits = 4)
} else {
cat("No significant genes found.")
}All Samples - Lenient Filtering
320ng vs 080ng - All Samples Lenient
sig_genes_320_vs_080_all_lenient <- results_320_vs_080_lenient %>%
filter(adj.P.Val < 0.05) %>%
rownames_to_column("gene_id") %>%
left_join(genes, by="gene_id") %>%
dplyr::select(gene_id, gene_name, logFC, AveExpr, t, P.Value, adj.P.Val, B) %>%
arrange(desc(abs(logFC)))
print(paste("Number of significant genes (All samples - lenient):", nrow(sig_genes_320_vs_080_all_lenient)))[1] "Number of significant genes (All samples - lenient): 4"
if(nrow(sig_genes_320_vs_080_all_lenient) > 0) {
DT::datatable(sig_genes_320_vs_080_all_lenient,
caption = "DE genes: 320ng vs 080ng (All samples - lenient filtering, FDR < 0.05)",
options = list(pageLength = 25, scrollX = TRUE, order = list(list(2, 'desc'))),
filter = 'top') %>%
DT::formatRound(columns = c('logFC', 'AveExpr', 't', 'P.Value', 'adj.P.Val', 'B'), digits = 4)
} else {
cat("No significant genes found.")
}920ng vs 080ng - All Samples Lenient
sig_genes_920_vs_080_all_lenient <- results_920_vs_080_lenient %>%
filter(adj.P.Val < 0.05) %>%
rownames_to_column("gene_id") %>%
left_join(genes, by="gene_id") %>%
dplyr::select(gene_id, gene_name, logFC, AveExpr, t, P.Value, adj.P.Val, B) %>%
arrange(desc(abs(logFC)))
print(paste("Number of significant genes (All samples - lenient):", nrow(sig_genes_920_vs_080_all_lenient)))[1] "Number of significant genes (All samples - lenient): 37"
if(nrow(sig_genes_920_vs_080_all_lenient) > 0) {
DT::datatable(sig_genes_920_vs_080_all_lenient,
caption = "DE genes: 920ng vs 080ng (All samples - lenient filtering, FDR < 0.05)",
options = list(pageLength = 25, scrollX = TRUE, order = list(list(2, 'desc'))),
filter = 'top') %>%
DT::formatRound(columns = c('logFC', 'AveExpr', 't', 'P.Value', 'adj.P.Val', 'B'), digits = 4)
} else {
cat("No significant genes found.")
}920ng vs 320ng - All Samples Lenient
sig_genes_920_vs_320_all_lenient <- results_920_vs_320_lenient %>%
filter(adj.P.Val < 0.05) %>%
rownames_to_column("gene_id") %>%
left_join(genes, by="gene_id") %>%
dplyr::select(gene_id, gene_name, logFC, AveExpr, t, P.Value, adj.P.Val, B) %>%
arrange(desc(abs(logFC)))
print(paste("Number of significant genes (All samples - lenient):", nrow(sig_genes_920_vs_320_all_lenient)))[1] "Number of significant genes (All samples - lenient): 29"
if(nrow(sig_genes_920_vs_320_all_lenient) > 0) {
DT::datatable(sig_genes_920_vs_320_all_lenient,
caption = "DE genes: 920ng vs 320ng (All samples - lenient filtering, FDR < 0.05)",
options = list(pageLength = 25, scrollX = TRUE, order = list(list(2, 'desc'))),
filter = 'top') %>%
DT::formatRound(columns = c('logFC', 'AveExpr', 't', 'P.Value', 'adj.P.Val', 'B'), digits = 4)
} else {
cat("No significant genes found.")
}Top 8 Samples - Stringent Filtering
320ng vs 080ng - Top 8 Stringent
sig_genes_320_vs_080_top8_stringent <- results_320_vs_080_top8_stringent %>%
filter(adj.P.Val < 0.05) %>%
rownames_to_column("gene_id") %>%
left_join(genes, by="gene_id") %>%
dplyr::select(gene_id, gene_name, logFC, AveExpr, t, P.Value, adj.P.Val, B) %>%
arrange(desc(abs(logFC)))
print(paste("Number of significant genes (Top 8 samples - stringent):", nrow(sig_genes_320_vs_080_top8_stringent)))[1] "Number of significant genes (Top 8 samples - stringent): 7"
if(nrow(sig_genes_320_vs_080_top8_stringent) > 0) {
DT::datatable(sig_genes_320_vs_080_top8_stringent,
caption = "DE genes: 320ng vs 080ng (Top 8 samples - stringent filtering, FDR < 0.05)",
options = list(pageLength = 25, scrollX = TRUE, order = list(list(2, 'desc'))),
filter = 'top') %>%
DT::formatRound(columns = c('logFC', 'AveExpr', 't', 'P.Value', 'adj.P.Val', 'B'), digits = 4)
} else {
cat("No significant genes found.")
}920ng vs 080ng - Top 8 Stringent
sig_genes_920_vs_080_top8_stringent <- results_920_vs_080_top8_stringent %>%
filter(adj.P.Val < 0.05) %>%
rownames_to_column("gene_id") %>%
left_join(genes, by="gene_id") %>%
dplyr::select(gene_id, gene_name, logFC, AveExpr, t, P.Value, adj.P.Val, B) %>%
arrange(desc(abs(logFC)))
print(paste("Number of significant genes (Top 8 samples - stringent):", nrow(sig_genes_920_vs_080_top8_stringent)))[1] "Number of significant genes (Top 8 samples - stringent): 100"
if(nrow(sig_genes_920_vs_080_top8_stringent) > 0) {
DT::datatable(sig_genes_920_vs_080_top8_stringent,
caption = "DE genes: 920ng vs 080ng (Top 8 samples - stringent filtering, FDR < 0.05)",
options = list(pageLength = 25, scrollX = TRUE, order = list(list(2, 'desc'))),
filter = 'top') %>%
DT::formatRound(columns = c('logFC', 'AveExpr', 't', 'P.Value', 'adj.P.Val', 'B'), digits = 4)
} else {
cat("No significant genes found.")
}920ng vs 320ng - Top 8 Stringent
sig_genes_920_vs_320_top8_stringent <- results_920_vs_320_top8_stringent %>%
filter(adj.P.Val < 0.05) %>%
rownames_to_column("gene_id") %>%
left_join(genes, by="gene_id") %>%
dplyr::select(gene_id, gene_name, logFC, AveExpr, t, P.Value, adj.P.Val, B) %>%
arrange(desc(abs(logFC)))
print(paste("Number of significant genes (Top 8 samples - stringent):", nrow(sig_genes_920_vs_320_top8_stringent)))[1] "Number of significant genes (Top 8 samples - stringent): 35"
if(nrow(sig_genes_920_vs_320_top8_stringent) > 0) {
DT::datatable(sig_genes_920_vs_320_top8_stringent,
caption = "DE genes: 920ng vs 320ng (Top 8 samples - stringent filtering, FDR < 0.05)",
options = list(pageLength = 25, scrollX = TRUE, order = list(list(2, 'desc'))),
filter = 'top') %>%
DT::formatRound(columns = c('logFC', 'AveExpr', 't', 'P.Value', 'adj.P.Val', 'B'), digits = 4)
} else {
cat("No significant genes found.")
}Top 8 Samples - Lenient Filtering
320ng vs 080ng - Top 8 Lenient
sig_genes_320_vs_080_top8_lenient <- results_320_vs_080_top8_lenient %>%
filter(adj.P.Val < 0.05) %>%
rownames_to_column("gene_id") %>%
left_join(genes, by="gene_id") %>%
dplyr::select(gene_id, gene_name, logFC, AveExpr, t, P.Value, adj.P.Val, B) %>%
arrange(desc(abs(logFC)))
print(paste("Number of significant genes (Top 8 samples - lenient):", nrow(sig_genes_320_vs_080_top8_lenient)))[1] "Number of significant genes (Top 8 samples - lenient): 13"
if(nrow(sig_genes_320_vs_080_top8_lenient) > 0) {
DT::datatable(sig_genes_320_vs_080_top8_lenient,
caption = "DE genes: 320ng vs 080ng (Top 8 samples - lenient filtering, FDR < 0.05)",
options = list(pageLength = 25, scrollX = TRUE, order = list(list(2, 'desc'))),
filter = 'top') %>%
DT::formatRound(columns = c('logFC', 'AveExpr', 't', 'P.Value', 'adj.P.Val', 'B'), digits = 4)
} else {
cat("No significant genes found.")
}920ng vs 080ng - Top 8 Lenient
sig_genes_920_vs_080_top8_lenient <- results_920_vs_080_top8_lenient %>%
filter(adj.P.Val < 0.05) %>%
rownames_to_column("gene_id") %>%
left_join(genes, by="gene_id") %>%
dplyr::select(gene_id, gene_name, logFC, AveExpr, t, P.Value, adj.P.Val, B) %>%
arrange(desc(abs(logFC)))
print(paste("Number of significant genes (Top 8 samples - lenient):", nrow(sig_genes_920_vs_080_top8_lenient)))[1] "Number of significant genes (Top 8 samples - lenient): 116"
if(nrow(sig_genes_920_vs_080_top8_lenient) > 0) {
DT::datatable(sig_genes_920_vs_080_top8_lenient,
caption = "DE genes: 920ng vs 080ng (Top 8 samples - lenient filtering, FDR < 0.05)",
options = list(pageLength = 25, scrollX = TRUE, order = list(list(2, 'desc'))),
filter = 'top') %>%
DT::formatRound(columns = c('logFC', 'AveExpr', 't', 'P.Value', 'adj.P.Val', 'B'), digits = 4)
} else {
cat("No significant genes found.")
}920ng vs 320ng - Top 8 Lenient
sig_genes_920_vs_320_top8_lenient <- results_920_vs_320_top8_lenient %>%
filter(adj.P.Val < 0.05) %>%
rownames_to_column("gene_id") %>%
left_join(genes, by="gene_id") %>%
dplyr::select(gene_id, gene_name, logFC, AveExpr, t, P.Value, adj.P.Val, B) %>%
arrange(desc(abs(logFC)))
print(paste("Number of significant genes (Top 8 samples - lenient):", nrow(sig_genes_920_vs_320_top8_lenient)))[1] "Number of significant genes (Top 8 samples - lenient): 49"
if(nrow(sig_genes_920_vs_320_top8_lenient) > 0) {
DT::datatable(sig_genes_920_vs_320_top8_lenient,
caption = "DE genes: 920ng vs 320ng (Top 8 samples - lenient filtering, FDR < 0.05)",
options = list(pageLength = 25, scrollX = TRUE, order = list(list(2, 'desc'))),
filter = 'top') %>%
DT::formatRound(columns = c('logFC', 'AveExpr', 't', 'P.Value', 'adj.P.Val', 'B'), digits = 4)
} else {
cat("No significant genes found.")
}DE Gene Overlap Analysis
UpSet Plot of DE Gene Overlaps
This section visualizes the overlap of differentially expressed genes across all 12 analysis conditions using an UpSet plot.
# Extract significant gene IDs from all 12 conditions
de_gene_lists <- list(
"All_Stringent_320v080" = rownames(results_320_vs_080[results_320_vs_080$adj.P.Val < 0.05, ]),
"All_Stringent_920v080" = rownames(results_920_vs_080[results_920_vs_080$adj.P.Val < 0.05, ]),
"All_Stringent_920v320" = rownames(results_920_vs_320[results_920_vs_320$adj.P.Val < 0.05, ]),
"All_Lenient_320v080" = rownames(results_320_vs_080_lenient[results_320_vs_080_lenient$adj.P.Val < 0.05, ]),
"All_Lenient_920v080" = rownames(results_920_vs_080_lenient[results_920_vs_080_lenient$adj.P.Val < 0.05, ]),
"All_Lenient_920v320" = rownames(results_920_vs_320_lenient[results_920_vs_320_lenient$adj.P.Val < 0.05, ]),
"Top8_Stringent_320v080" = rownames(results_320_vs_080_top8_stringent[results_320_vs_080_top8_stringent$adj.P.Val < 0.05, ]),
"Top8_Stringent_920v080" = rownames(results_920_vs_080_top8_stringent[results_920_vs_080_top8_stringent$adj.P.Val < 0.05, ]),
"Top8_Stringent_920v320" = rownames(results_920_vs_320_top8_stringent[results_920_vs_320_top8_stringent$adj.P.Val < 0.05, ]),
"Top8_Lenient_320v080" = rownames(results_320_vs_080_top8_lenient[results_320_vs_080_top8_lenient$adj.P.Val < 0.05, ]),
"Top8_Lenient_920v080" = rownames(results_920_vs_080_top8_lenient[results_920_vs_080_top8_lenient$adj.P.Val < 0.05, ]),
"Top8_Lenient_920v320" = rownames(results_920_vs_320_top8_lenient[results_920_vs_320_top8_lenient$adj.P.Val < 0.05, ])
)
de_gene_lists <- list(
"All_Stringent_320v080" = rownames(results_320_vs_080[results_320_vs_080$adj.P.Val < 0.05, ]),
"All_Lenient_320v080" = rownames(results_320_vs_080_lenient[results_320_vs_080_lenient$adj.P.Val < 0.05, ]),
"Top8_Stringent_320v080" = rownames(results_320_vs_080_top8_stringent[results_320_vs_080_top8_stringent$adj.P.Val < 0.05, ]),
"Top8_Lenient_320v080" = rownames(results_320_vs_080_top8_lenient[results_320_vs_080_top8_lenient$adj.P.Val < 0.05, ]),
"All_Stringent_920v080" = rownames(results_920_vs_080[results_920_vs_080$adj.P.Val < 0.05, ]),
"All_Lenient_920v080" = rownames(results_920_vs_080_lenient[results_920_vs_080_lenient$adj.P.Val < 0.05, ]),
"Top8_Stringent_920v080" = rownames(results_920_vs_080_top8_stringent[results_920_vs_080_top8_stringent$adj.P.Val < 0.05, ]),
"Top8_Lenient_920v080" = rownames(results_920_vs_080_top8_lenient[results_920_vs_080_top8_lenient$adj.P.Val < 0.05, ]),
"All_Stringent_920v320" = rownames(results_920_vs_320[results_920_vs_320$adj.P.Val < 0.05, ]),
"All_Lenient_920v320" = rownames(results_920_vs_320_lenient[results_920_vs_320_lenient$adj.P.Val < 0.05, ]),
"Top8_Stringent_920v320" = rownames(results_920_vs_320_top8_stringent[results_920_vs_320_top8_stringent$adj.P.Val < 0.05, ]),
"Top8_Lenient_920v320" = rownames(results_920_vs_320_top8_lenient[results_920_vs_320_top8_lenient$adj.P.Val < 0.05, ])
)
# Print summary of DE genes per condition
cat("DE genes per condition (FDR < 0.05):\n")DE genes per condition (FDR < 0.05):
sapply(de_gene_lists, length) All_Stringent_320v080 All_Lenient_320v080 Top8_Stringent_320v080
3 4 7
Top8_Lenient_320v080 All_Stringent_920v080 All_Lenient_920v080
13 27 37
Top8_Stringent_920v080 Top8_Lenient_920v080 All_Stringent_920v320
100 116 15
All_Lenient_920v320 Top8_Stringent_920v320 Top8_Lenient_920v320
29 35 49
# Create UpSet plot
upset(fromList(de_gene_lists),
order.by = "freq",
nsets = 12,
sets = names(de_gene_lists),
keep.order = TRUE,
nintersects = 30,
sets.bar.color = "#56B4E9",
main.bar.color = "#E69F00",
matrix.color = "#E69F00",
text.scale = c(1.3, 1.3, 1, 1, 2, 1.5),
set_size.show = TRUE,
set_size.scale_max = max(sapply(de_gene_lists, length)) * 1.1)Summary of DE Gene Overlaps
# Create a summary table of overlaps
overlap_matrix <- fromList(de_gene_lists)
# Calculate total unique genes
total_unique_genes <- nrow(overlap_matrix)
cat("Total unique DE genes across all conditions:", total_unique_genes, "\n\n")Total unique DE genes across all conditions: 133
# Find genes that are DE in all contrasts for each analysis type
all_contrasts_all_stringent <- rownames(overlap_matrix)[
overlap_matrix$All_Stringent_320v080 == 1 &
overlap_matrix$All_Stringent_920v080 == 1 &
overlap_matrix$All_Stringent_920v320 == 1]
all_contrasts_all_lenient <- rownames(overlap_matrix)[
overlap_matrix$All_Lenient_320v080 == 1 &
overlap_matrix$All_Lenient_920v080 == 1 &
overlap_matrix$All_Lenient_920v320 == 1]
all_contrasts_top8_stringent <- rownames(overlap_matrix)[
overlap_matrix$Top8_Stringent_320v080 == 1 &
overlap_matrix$Top8_Stringent_920v080 == 1 &
overlap_matrix$Top8_Stringent_920v320 == 1]
all_contrasts_top8_lenient <- rownames(overlap_matrix)[
overlap_matrix$Top8_Lenient_320v080 == 1 &
overlap_matrix$Top8_Lenient_920v080 == 1 &
overlap_matrix$Top8_Lenient_920v320 == 1]
# Find genes that are consistently DE across all 12 conditions
consistent_genes <- rownames(overlap_matrix)[rowSums(overlap_matrix) == 12]
cat("Genes DE in all 3 contrasts per analysis type:\n")Genes DE in all 3 contrasts per analysis type:
cat("All samples - stringent filtering:", length(all_contrasts_all_stringent), "genes\n")All samples - stringent filtering: 0 genes
cat("All samples - lenient filtering:", length(all_contrasts_all_lenient), "genes\n")All samples - lenient filtering: 1 genes
cat("Top 8 samples - stringent filtering:", length(all_contrasts_top8_stringent), "genes\n")Top 8 samples - stringent filtering: 0 genes
cat("Top 8 samples - lenient filtering:", length(all_contrasts_top8_lenient), "genes\n\n")Top 8 samples - lenient filtering: 6 genes
cat("Genes consistently DE across ALL 12 conditions:", length(consistent_genes), "genes\n")Genes consistently DE across ALL 12 conditions: 0 genes
if(length(consistent_genes) > 0) {
cat("Consistently DE genes:\n")
consistent_genes_with_names <- genes[genes$gene_id %in% consistent_genes, c("gene_id", "gene_name")]
knitr::kable(consistent_genes_with_names, caption = "Genes DE in all 12 analysis conditions")
}Analysis-Specific Overlaps
# Create separate UpSet plots for different groupings
# All stringent
all_stringent_lists <- de_gene_lists[grepl("^All_.*Stringent", names(de_gene_lists))]
# All lenient
all_lenient_lists <- de_gene_lists[grepl("^All_.*Lenient", names(de_gene_lists))]
# Top 8 stringent
top8_stringent_lists <- de_gene_lists[grepl("^Top8_.*Stringent", names(de_gene_lists))]
# Top 8 lenient
top8_lenient_lists <- de_gene_lists[grepl("^Top8_.*Lenient", names(de_gene_lists))]
# 1. All samples (stringent vs lenient)
all_samples_lists <- de_gene_lists[grepl("^All_", names(de_gene_lists))]
# 2. Top 8 samples (stringent vs lenient)
top8_samples_lists <- de_gene_lists[grepl("^Top8_", names(de_gene_lists))]
# 3. Stringent filtering (all vs top8)
stringent_lists <- de_gene_lists[grepl("Stringent", names(de_gene_lists))]
# 4. Lenient filtering (all vs top8)
lenient_lists <- de_gene_lists[grepl("Lenient", names(de_gene_lists))]
# All samples comparison
upset(fromList(all_samples_lists),
order.by = "freq",
nsets = 6,
nintersects = 20,
sets.bar.color = "#56B4E9",
main.bar.color = "#E69F00",
matrix.color = "#E69F00",
text.scale = c(1.2, 1.2, 1, 1, 1.5, 1.2),
set_size.show = TRUE,
mainbar.y.label = "Intersection Size",
sets.x.label = "Set Size")All stringent
upset(fromList(all_stringent_lists),
order.by = "freq",
nsets = 6,
nintersects = 20,
sets.bar.color = "#56B4E9",
main.bar.color = "#E69F00",
matrix.color = "#E69F00",
text.scale = c(1.2, 1.2, 1, 1, 1.5, 1.2),
set_size.show = TRUE,
mainbar.y.label = "Intersection Size",
sets.x.label = "Set Size")All lenient
upset(fromList(all_lenient_lists),
order.by = "freq",
nsets = 6,
nintersects = 20,
sets.bar.color = "#56B4E9",
main.bar.color = "#E69F00",
matrix.color = "#E69F00",
text.scale = c(1.2, 1.2, 1, 1, 1.5, 1.2),
set_size.show = TRUE,
mainbar.y.label = "Intersection Size",
sets.x.label = "Set Size")Top8 stringent
upset(fromList(top8_stringent_lists),
order.by = "freq",
nsets = 6,
nintersects = 20,
sets.bar.color = "#56B4E9",
main.bar.color = "#E69F00",
matrix.color = "#E69F00",
text.scale = c(1.2, 1.2, 1, 1, 1.5, 1.2),
set_size.show = TRUE,
mainbar.y.label = "Intersection Size",
sets.x.label = "Set Size")Top8 lenient
upset(fromList(top8_lenient_lists),
order.by = "freq",
nsets = 6,
nintersects = 20,
sets.bar.color = "#56B4E9",
main.bar.color = "#E69F00",
matrix.color = "#E69F00",
text.scale = c(1.2, 1.2, 1, 1, 1.5, 1.2),
set_size.show = TRUE,
mainbar.y.label = "Intersection Size",
sets.x.label = "Set Size")Top 8 Samples: Stringent vs Lenient Filtering
# Top 8 samples comparison
upset(fromList(top8_samples_lists),
order.by = "freq",
nsets = 6,
nintersects = 20,
sets.bar.color = "#56B4E9",
main.bar.color = "#E69F00",
matrix.color = "#E69F00",
text.scale = c(1.2, 1.2, 1, 1, 1.5, 1.2),
set_size.show = TRUE,
mainbar.y.label = "Intersection Size",
sets.x.label = "Set Size")Stringent Filtering: All vs Top 8 Samples
# Stringent filtering comparison
upset(fromList(stringent_lists),
order.by = "freq",
nsets = 6,
nintersects = 20,
sets.bar.color = "#56B4E9",
main.bar.color = "#E69F00",
matrix.color = "#E69F00",
text.scale = c(1.2, 1.2, 1, 1, 1.5, 1.2),
set_size.show = TRUE,
mainbar.y.label = "Intersection Size",
sets.x.label = "Set Size")Lenient Filtering: All vs Top 8 Samples
# Lenient filtering comparison
upset(fromList(lenient_lists),
order.by = "freq",
nsets = 6,
nintersects = 20,
sets.bar.color = "#56B4E9",
main.bar.color = "#E69F00",
matrix.color = "#E69F00",
text.scale = c(1.2, 1.2, 1, 1, 1.5, 1.2),
set_size.show = TRUE,
mainbar.y.label = "Intersection Size",
sets.x.label = "Set Size")Gene Set Enrichment Analysis
Configuration: Select DEG Dataset for GO Enrichment
# USER CONFIGURATION: Choose which DEG dataset to use for GO enrichment
# Options:
# 1. "all_stringent" - All samples with stringent filtering
# 2. "all_lenient" - All samples with lenient filtering
# 3. "top8_stringent" - Top 8 samples with stringent filtering
# 4. "top8_lenient" - Top 8 samples with lenient filtering
selected_deg_dataset <- "top8_lenient" # Change this to select different dataset
# USER CONFIGURATION: Choose GO category for enrichment analysis
# Options:
# - "BP" - Biological Process
# - "MF" - Molecular Function
# - "CC" - Cellular Component
selected_go_category <- "MF" # Change this to select different GO category
# Define the results objects based on selection
if(selected_deg_dataset == "all_stringent") {
selected_results_320_080 <- results_320_vs_080
selected_results_920_080 <- results_920_vs_080
selected_results_920_320 <- results_920_vs_320
dataset_description <- "All samples with stringent filtering"
} else if(selected_deg_dataset == "all_lenient") {
selected_results_320_080 <- results_320_vs_080_lenient
selected_results_920_080 <- results_920_vs_080_lenient
selected_results_920_320 <- results_920_vs_320_lenient
dataset_description <- "All samples with lenient filtering"
} else if(selected_deg_dataset == "top8_stringent") {
selected_results_320_080 <- results_320_vs_080_top8_stringent
selected_results_920_080 <- results_920_vs_080_top8_stringent
selected_results_920_320 <- results_920_vs_320_top8_stringent
dataset_description <- "Top 8 samples with stringent filtering"
} else if(selected_deg_dataset == "top8_lenient") {
selected_results_320_080 <- results_320_vs_080_top8_lenient
selected_results_920_080 <- results_920_vs_080_top8_lenient
selected_results_920_320 <- results_920_vs_320_top8_lenient
dataset_description <- "Top 8 samples with lenient filtering"
} else {
stop("Invalid selection. Choose from: all_stringent, all_lenient, top8_stringent, top8_lenient")
}
# Define GO category description
if(selected_go_category == "BP") {
go_description <- "Biological Process"
} else if(selected_go_category == "MF") {
go_description <- "Molecular Function"
} else if(selected_go_category == "CC") {
go_description <- "Cellular Component"
} else {
stop("Invalid GO category. Choose from: BP, MF, CC")
}
cat("Selected DEG dataset for GO enrichment:", dataset_description, "\n")Selected DEG dataset for GO enrichment: Top 8 samples with lenient filtering
cat("Selected GO category:", go_description, "(", selected_go_category, ")\n")Selected GO category: Molecular Function ( MF )
cat("Change variables above to use different dataset or GO category\n\n")Change variables above to use different dataset or GO category
GO Enrichment Analysis - Selected Dataset
This section performs Gene Ontology (GO) enrichment analysis on the differentially expressed genes from the selected dataset: Top 8 samples with lenient filtering using Molecular Function terms.
Gene ID Conversion and Background
# Convert gene symbols to Entrez IDs for mouse
# First, let's check what type of gene IDs we have
head(genes) gene_id gene_name
1 ENSMUSG00000102693.2 4933401J01Rik
2 ENSMUSG00000064842.3 Gm26206
3 ENSMUSG00000051951.6 Xkr4
4 ENSMUSG00000102851.2 Gm18956
5 ENSMUSG00000103377.2 Gm37180
6 ENSMUSG00000104017.2 Gm37363
# Create condition-specific expressed gene sets
# Define expressed genes for each pool size condition (mean CPM > 1 within condition)
# Get samples for each condition
pool080_samples <- sample_info$sample[sample_info$pool_size == "pool080"]
pool320_samples <- sample_info$sample[sample_info$pool_size == "pool320"]
pool920_samples <- sample_info$sample[sample_info$pool_size == "pool920"]
# Calculate condition-specific expression
count_matrix_lenient <- as.matrix(counts_filt_alt[,-c(1:2)])
rownames(count_matrix_lenient) <- counts_filt_alt$gene_id
cpm_lenient <- cpm(count_matrix_lenient)
# Get expressed genes per condition (mean CPM > 1 within condition)
expressed_pool080 <- rownames(cpm_lenient)[apply(cpm_lenient[, pool080_samples], 1, function(x) mean(x) > 1)]
expressed_pool320 <- rownames(cpm_lenient)[apply(cpm_lenient[, pool320_samples], 1, function(x) mean(x) > 1)]
expressed_pool920 <- rownames(cpm_lenient)[apply(cpm_lenient[, pool920_samples], 1, function(x) mean(x) > 1)]
# Create pairwise universes (genes expressed in both conditions of each contrast)
universe_320_080_genes <- intersect(expressed_pool320, expressed_pool080)
universe_920_080_genes <- intersect(expressed_pool920, expressed_pool080)
universe_920_320_genes <- intersect(expressed_pool920, expressed_pool320)
print(paste("Genes expressed in pool080:", length(expressed_pool080)))[1] "Genes expressed in pool080: 12817"
print(paste("Genes expressed in pool320:", length(expressed_pool320)))[1] "Genes expressed in pool320: 12837"
print(paste("Genes expressed in pool920:", length(expressed_pool920)))[1] "Genes expressed in pool920: 12867"
print(paste("Universe 320v080 (genes expressed in both):", length(universe_320_080_genes)))[1] "Universe 320v080 (genes expressed in both): 12789"
print(paste("Universe 920v080 (genes expressed in both):", length(universe_920_080_genes)))[1] "Universe 920v080 (genes expressed in both): 12816"
print(paste("Universe 920v320 (genes expressed in both):", length(universe_920_320_genes)))[1] "Universe 920v320 (genes expressed in both): 12835"
# Convert to gene names and then to Entrez IDs for each universe
get_entrez_universe <- function(gene_ids) {
gene_names <- genes$gene_name[genes$gene_id %in% gene_ids]
entrez_ids <- mapIds(org.Mm.eg.db,
keys = gene_names,
column = "ENTREZID",
keytype = "SYMBOL",
multiVals = "first")
return(entrez_ids[!is.na(entrez_ids)])
}
universe_320_080_entrez <- get_entrez_universe(universe_320_080_genes)
universe_920_080_entrez <- get_entrez_universe(universe_920_080_genes)
universe_920_320_entrez <- get_entrez_universe(universe_920_320_genes)
print(paste("Universe 320v080 with Entrez IDs:", length(universe_320_080_entrez)))[1] "Universe 320v080 with Entrez IDs: 12292"
print(paste("Universe 920v080 with Entrez IDs:", length(universe_920_080_entrez)))[1] "Universe 920v080 with Entrez IDs: 12314"
print(paste("Universe 920v320 with Entrez IDs:", length(universe_920_320_entrez)))[1] "Universe 920v320 with Entrez IDs: 12322"
320ng vs 080ng GO Enrichment
# Get significant genes from selected dataset
sig_genes_320_vs_080 <- selected_results_320_080 %>%
filter(adj.P.Val < 0.05) %>%
rownames_to_column("gene_id") %>%
left_join(genes, by="gene_id")
print(paste("DE genes for 320ng vs 080ng (", dataset_description, "):", nrow(sig_genes_320_vs_080)))[1] "DE genes for 320ng vs 080ng ( Top 8 samples with lenient filtering ): 13"
if(nrow(sig_genes_320_vs_080) > 0) {
# Convert to Entrez IDs
de_genes_entrez_320_080 <- mapIds(org.Mm.eg.db,
keys = sig_genes_320_vs_080$gene_name,
column = "ENTREZID",
keytype = "SYMBOL",
multiVals = "first")
# Remove NA values
de_genes_entrez_320_080 <- de_genes_entrez_320_080[!is.na(de_genes_entrez_320_080)]
print(paste("DE genes with Entrez IDs:", length(de_genes_entrez_320_080)))
if(length(de_genes_entrez_320_080) > 5) {
# Use condition-specific universe (genes expressed in both 320ng and 080ng conditions)
print(paste("Universe size (genes expressed in both 320ng and 080ng):", length(universe_320_080_entrez)))
print(paste("DE genes for enrichment:", length(de_genes_entrez_320_080)))
# GO enrichment analysis with condition-specific background
go_results_320_080 <- enrichGO(gene = de_genes_entrez_320_080,
universe = universe_320_080_entrez,
OrgDb = org.Mm.eg.db,
ont = selected_go_category, # Use selected GO category
pAdjustMethod = "BH",
qvalueCutoff = 0.05,
readable = TRUE)
if(nrow(go_results_320_080) > 0) {
# Display results table
knitr::kable(head(go_results_320_080@result, 10),
caption = paste("Top 10 GO", go_description, "terms - 320ng vs 080ng (", dataset_description, ")"))
# Dot plot
p1 <- dotplot(go_results_320_080, showCategory = 15) +
ggtitle(paste("GO", selected_go_category, "Enrichment: 320ng vs 080ng\n", dataset_description))
print(p1)
# Bar plot
p2 <- barplot(go_results_320_080, showCategory = 15) +
ggtitle(paste("GO", selected_go_category, "Enrichment: 320ng vs 080ng\n", dataset_description))
print(p2)
# Enrichment map (if enough terms)
if(nrow(go_results_320_080) >= 5) {
p3 <- emapplot(pairwise_termsim(go_results_320_080), showCategory = 20) +
ggtitle(paste("GO", selected_go_category, "Enrichment Map: 320ng vs 080ng\n", dataset_description))
print(p3)
}
} else {
cat("No significant GO terms found for 320ng vs 080ng")
}
} else {
cat("Not enough genes with Entrez IDs for enrichment analysis")
}
} else {
cat("No DE genes found for 320ng vs 080ng")
}'select()' returned 1:1 mapping between keys and columns
[1] "DE genes with Entrez IDs: 5"
Not enough genes with Entrez IDs for enrichment analysis
920ng vs 080ng GO Enrichment
# Get significant genes from selected dataset
sig_genes_920_vs_080 <- selected_results_920_080 %>%
filter(adj.P.Val < 0.05) %>%
rownames_to_column("gene_id") %>%
left_join(genes, by="gene_id")
print(paste("DE genes for 920ng vs 080ng (", dataset_description, "):", nrow(sig_genes_920_vs_080)))[1] "DE genes for 920ng vs 080ng ( Top 8 samples with lenient filtering ): 116"
if(nrow(sig_genes_920_vs_080) > 0) {
# Convert to Entrez IDs
de_genes_entrez_920_080 <- mapIds(org.Mm.eg.db,
keys = sig_genes_920_vs_080$gene_name,
column = "ENTREZID",
keytype = "SYMBOL",
multiVals = "first")
# Remove NA values
de_genes_entrez_920_080 <- de_genes_entrez_920_080[!is.na(de_genes_entrez_920_080)]
print(paste("DE genes with Entrez IDs:", length(de_genes_entrez_920_080)))
if(length(de_genes_entrez_920_080) > 5) {
# Use condition-specific universe (genes expressed in both 920ng and 080ng conditions)
print(paste("Universe size (genes expressed in both 920ng and 080ng):", length(universe_920_080_entrez)))
print(paste("DE genes for enrichment:", length(de_genes_entrez_920_080)))
# GO enrichment analysis with condition-specific background
go_results_920_080 <- enrichGO(gene = de_genes_entrez_920_080,
universe = universe_920_080_entrez,
OrgDb = org.Mm.eg.db,
ont = selected_go_category, # Use selected GO category
pAdjustMethod = "BH",
qvalueCutoff = 0.05,
readable = TRUE)
if(nrow(go_results_920_080) > 0) {
# Display results table
knitr::kable(head(go_results_920_080@result, 10),
caption = paste("Top 10 GO", go_description, "terms - 920ng vs 080ng (", dataset_description, ")"))
# Dot plot
p1 <- dotplot(go_results_920_080, showCategory = 15) +
ggtitle(paste("GO", selected_go_category, "Enrichment: 920ng vs 080ng\n", dataset_description))
print(p1)
# Bar plot
p2 <- barplot(go_results_920_080, showCategory = 15) +
ggtitle(paste("GO", selected_go_category, "Enrichment: 920ng vs 080ng\n", dataset_description))
print(p2)
# Enrichment map (if enough terms)
if(nrow(go_results_920_080) >= 5) {
p3 <- emapplot(pairwise_termsim(go_results_920_080), showCategory = 20) +
ggtitle(paste("GO", selected_go_category, "Enrichment Map: 920ng vs 080ng\n", dataset_description))
print(p3)
}
} else {
cat("No significant GO terms found for 920ng vs 080ng")
}
} else {
cat("Not enough genes with Entrez IDs for enrichment analysis")
}
} else {
cat("No DE genes found for 920ng vs 080ng")
}'select()' returned 1:1 mapping between keys and columns
[1] "DE genes with Entrez IDs: 97"
[1] "Universe size (genes expressed in both 920ng and 080ng): 12314"
[1] "DE genes for enrichment: 97"
920ng vs 320ng GO Enrichment
# Get significant genes from selected dataset
sig_genes_920_vs_320 <- selected_results_920_320 %>%
filter(adj.P.Val < 0.05) %>%
rownames_to_column("gene_id") %>%
left_join(genes, by="gene_id")
print(paste("DE genes for 920ng vs 320ng (", dataset_description, "):", nrow(sig_genes_920_vs_320)))[1] "DE genes for 920ng vs 320ng ( Top 8 samples with lenient filtering ): 49"
if(nrow(sig_genes_920_vs_320) > 0) {
# Convert to Entrez IDs
de_genes_entrez_920_320 <- mapIds(org.Mm.eg.db,
keys = sig_genes_920_vs_320$gene_name,
column = "ENTREZID",
keytype = "SYMBOL",
multiVals = "first")
# Remove NA values
de_genes_entrez_920_320 <- de_genes_entrez_920_320[!is.na(de_genes_entrez_920_320)]
print(paste("DE genes with Entrez IDs:", length(de_genes_entrez_920_320)))
if(length(de_genes_entrez_920_320) > 5) {
# Use condition-specific universe (genes expressed in both 920ng and 320ng conditions)
print(paste("Universe size (genes expressed in both 920ng and 320ng):", length(universe_920_320_entrez)))
print(paste("DE genes for enrichment:", length(de_genes_entrez_920_320)))
# GO enrichment analysis with condition-specific background
go_results_920_320 <- enrichGO(gene = de_genes_entrez_920_320,
universe = universe_920_320_entrez,
OrgDb = org.Mm.eg.db,
ont = selected_go_category, # Use selected GO category
pAdjustMethod = "BH",
qvalueCutoff = 0.05,
readable = TRUE)
if(nrow(go_results_920_320) > 0) {
# Display results table
knitr::kable(head(go_results_920_320@result, 10),
caption = paste("Top 10 GO", go_description, "terms - 920ng vs 320ng (", dataset_description, ")"))
# Dot plot
p1 <- dotplot(go_results_920_320, showCategory = 15) +
ggtitle(paste("GO", selected_go_category, "Enrichment: 920ng vs 320ng\n", dataset_description))
print(p1)
# Bar plot
p2 <- barplot(go_results_920_320, showCategory = 15) +
ggtitle(paste("GO", selected_go_category, "Enrichment: 920ng vs 320ng\n", dataset_description))
print(p2)
# Enrichment map (if enough terms)
if(nrow(go_results_920_320) >= 5) {
p3 <- emapplot(pairwise_termsim(go_results_920_320), showCategory = 20) +
ggtitle(paste("GO", selected_go_category, "Enrichment Map: 920ng vs 320ng\n", dataset_description))
print(p3)
}
} else {
cat("No significant GO terms found for 920ng vs 320ng")
}
} else {
cat("Not enough genes with Entrez IDs for enrichment analysis")
}
} else {
cat("No DE genes found for 920ng vs 320ng")
}'select()' returned 1:1 mapping between keys and columns
[1] "DE genes with Entrez IDs: 34"
[1] "Universe size (genes expressed in both 920ng and 320ng): 12322"
[1] "DE genes for enrichment: 34"
GO Enrichment Summary
# Summary of enrichment results for selected dataset
cat("=== GO ENRICHMENT SUMMARY ===\n")=== GO ENRICHMENT SUMMARY ===
cat("Selected dataset:", dataset_description, "\n")Selected dataset: Top 8 samples with lenient filtering
cat("Selected GO category:", go_description, "(", selected_go_category, ")\n")Selected GO category: Molecular Function ( MF )
cat("Background strategy: Genes expressed in both conditions of each contrast\n\n")Background strategy: Genes expressed in both conditions of each contrast
if(exists("go_results_320_080")) {
cat("320ng vs 080ng: ", nrow(go_results_320_080), " significant GO terms\n")
}
if(exists("go_results_920_080")) {
cat("920ng vs 080ng: ", nrow(go_results_920_080), " significant GO terms\n")
}920ng vs 080ng: 2 significant GO terms
if(exists("go_results_920_320")) {
cat("920ng vs 320ng: ", nrow(go_results_920_320), " significant GO terms\n")
}920ng vs 320ng: 3 significant GO terms
DE Gene Count Visualization
Number of Up/Down-regulated Genes per Contrast
This section creates barplots showing the number of up- and down-regulated genes for each contrast across all analysis setups.
# Function to extract DE gene counts
get_de_counts <- function(results_list, setup_name) {
contrasts <- c("320v080", "920v080", "920v320")
de_counts <- data.frame(
Setup = character(),
Contrast = character(),
Direction = character(),
Count = numeric(),
stringsAsFactors = FALSE
)
for(i in 1:length(results_list)) {
result <- results_list[[i]]
contrast <- contrasts[i]
# Get significant genes
sig_genes <- result[result$adj.P.Val < 0.05, ]
# Count up and down regulated
up_count <- sum(sig_genes$logFC > 0, na.rm = TRUE)
down_count <- sum(sig_genes$logFC < 0, na.rm = TRUE)
# Add to data frame
de_counts <- rbind(de_counts,
data.frame(Setup = setup_name,
Contrast = contrast,
Direction = "Up-regulated",
Count = up_count),
data.frame(Setup = setup_name,
Contrast = contrast,
Direction = "Down-regulated",
Count = -down_count)) # Negative for plotting below x-axis
}
return(de_counts)
}
# Extract DE counts for all setups
all_stringent_counts <- get_de_counts(
list(results_320_vs_080, results_920_vs_080, results_920_vs_320),
"All Samples - Stringent"
)
all_lenient_counts <- get_de_counts(
list(results_320_vs_080_lenient, results_920_vs_080_lenient, results_920_vs_320_lenient),
"All Samples - Lenient"
)
top8_stringent_counts <- get_de_counts(
list(results_320_vs_080_top8_stringent, results_920_vs_080_top8_stringent, results_920_vs_320_top8_stringent),
"Top 8 Samples - Stringent"
)
top8_lenient_counts <- get_de_counts(
list(results_320_vs_080_top8_lenient, results_920_vs_080_top8_lenient, results_920_vs_320_top8_lenient),
"Top 8 Samples - Lenient"
)
# Combine all data
all_de_counts <- rbind(all_stringent_counts, all_lenient_counts, top8_stringent_counts, top8_lenient_counts)
# Create individual plots for each setup
library(ggplot2)
# 1. All Samples - Stringent
p1 <- ggplot(all_stringent_counts, aes(x = Contrast, y = Count, fill = Direction)) +
geom_bar(stat = "identity", position = "identity", width = 0.7) +
geom_hline(yintercept = 0, color = "black", size = 0.5) +
scale_fill_manual(values = c("Up-regulated" = "#E31A1C", "Down-regulated" = "#1F78B4")) +
labs(title = "All Samples - Stringent Filtering",
x = "Contrast",
y = "Number of DE genes",
fill = "Regulation") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1),
plot.title = element_text(hjust = 0.5, size = 14, face = "bold"),
legend.position = "bottom") +
geom_text(aes(label = abs(Count)), vjust = ifelse(all_stringent_counts$Count > 0, -0.5, 1.5),
size = 4, fontface = "bold") +
scale_y_continuous(labels = abs, limits = c(min(all_de_counts$Count) * 1.2, max(all_de_counts$Count) * 1.2))Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
ℹ Please use `linewidth` instead.
# 2. All Samples - Lenient
p2 <- ggplot(all_lenient_counts, aes(x = Contrast, y = Count, fill = Direction)) +
geom_bar(stat = "identity", position = "identity", width = 0.7) +
geom_hline(yintercept = 0, color = "black", size = 0.5) +
scale_fill_manual(values = c("Up-regulated" = "#E31A1C", "Down-regulated" = "#1F78B4")) +
labs(title = "All Samples - Lenient Filtering",
x = "Contrast",
y = "Number of DE genes",
fill = "Regulation") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1),
plot.title = element_text(hjust = 0.5, size = 14, face = "bold"),
legend.position = "bottom") +
geom_text(aes(label = abs(Count)), vjust = ifelse(all_lenient_counts$Count > 0, -0.5, 1.5),
size = 4, fontface = "bold") +
scale_y_continuous(labels = abs, limits = c(min(all_de_counts$Count) * 1.2, max(all_de_counts$Count) * 1.2))
# 3. Top 8 Samples - Stringent
p3 <- ggplot(top8_stringent_counts, aes(x = Contrast, y = Count, fill = Direction)) +
geom_bar(stat = "identity", position = "identity", width = 0.7) +
geom_hline(yintercept = 0, color = "black", size = 0.5) +
scale_fill_manual(values = c("Up-regulated" = "#E31A1C", "Down-regulated" = "#1F78B4")) +
labs(title = "Top 8 Samples - Stringent Filtering",
x = "Contrast",
y = "Number of DE genes",
fill = "Regulation") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1),
plot.title = element_text(hjust = 0.5, size = 14, face = "bold"),
legend.position = "bottom") +
geom_text(aes(label = abs(Count)), vjust = ifelse(top8_stringent_counts$Count > 0, -0.5, 1.5),
size = 4, fontface = "bold") +
scale_y_continuous(labels = abs, limits = c(min(all_de_counts$Count) * 1.2, max(all_de_counts$Count) * 1.2))
# 4. Top 8 Samples - Lenient
p4 <- ggplot(top8_lenient_counts, aes(x = Contrast, y = Count, fill = Direction)) +
geom_bar(stat = "identity", position = "identity", width = 0.7) +
geom_hline(yintercept = 0, color = "black", size = 0.5) +
scale_fill_manual(values = c("Up-regulated" = "#E31A1C", "Down-regulated" = "#1F78B4")) +
labs(title = "Top 8 Samples - Lenient Filtering",
x = "Contrast",
y = "Number of DE genes",
fill = "Regulation") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1),
plot.title = element_text(hjust = 0.5, size = 14, face = "bold"),
legend.position = "bottom") +
geom_text(aes(label = abs(Count)), vjust = ifelse(top8_lenient_counts$Count > 0, -0.5, 1.5),
size = 4, fontface = "bold") +
scale_y_continuous(labels = abs, limits = c(min(all_de_counts$Count) * 1.2, max(all_de_counts$Count) * 1.2))
# Display all plots
library(gridExtra)
Attaching package: 'gridExtra'
The following object is masked from 'package:Biobase':
combine
The following object is masked from 'package:BiocGenerics':
combine
The following object is masked from 'package:dplyr':
combine
grid.arrange(p1, p2, p3, p4, ncol = 2, nrow = 2)# Print summary table
cat("=== DE GENE COUNT SUMMARY ===\n")=== DE GENE COUNT SUMMARY ===
summary_table <- all_de_counts %>%
mutate(Count = abs(Count)) %>%
pivot_wider(names_from = Direction, values_from = Count) %>%
mutate(Total = `Up-regulated` + `Down-regulated`) %>%
arrange(Setup, Contrast)
knitr::kable(summary_table, caption = "Summary of DE gene counts across all analysis setups")| Setup | Contrast | Up-regulated | Down-regulated | Total |
|---|---|---|---|---|
| All Samples - Lenient | 320v080 | 4 | 0 | 4 |
| All Samples - Lenient | 920v080 | 29 | 8 | 37 |
| All Samples - Lenient | 920v320 | 23 | 6 | 29 |
| All Samples - Stringent | 320v080 | 3 | 0 | 3 |
| All Samples - Stringent | 920v080 | 17 | 10 | 27 |
| All Samples - Stringent | 920v320 | 10 | 5 | 15 |
| Top 8 Samples - Lenient | 320v080 | 12 | 1 | 13 |
| Top 8 Samples - Lenient | 920v080 | 78 | 38 | 116 |
| Top 8 Samples - Lenient | 920v320 | 34 | 15 | 49 |
| Top 8 Samples - Stringent | 320v080 | 6 | 1 | 7 |
| Top 8 Samples - Stringent | 920v080 | 63 | 37 | 100 |
| Top 8 Samples - Stringent | 920v320 | 21 | 14 | 35 |
Individual Setup Plots
All Samples - Stringent Filtering
print(p1)All Samples - Lenient Filtering
print(p2)Top 8 Samples - Stringent Filtering
print(p3)Top 8 Samples - Lenient Filtering
print(p4)